def scrape_online_delv(pageNo):

    r = requests.get(
        "https://www.zomato.com/ncr/west-delhi-order-online?page=%d" % pageNo,
        cookies=cookie_jar,
        headers=headersUA)
    #requests.get takes the page URL and returns a response whose .text holds the HTML
    soup = BS(r.text, "html.parser")
    my_divs = soup.find_all("div", {"class": "search-o2-card"})

    for div in my_divs:
        #name of the rest
        rstName = div.findChildren(
            "a", {"class": "result-order-flow-title"})[0].text.strip()

        #link to the direct page of the rest on zomato
        link = []
        link = div.findChildren("a", attrs={
            'href': re.compile("^https://")
        })  #re.compile helps in pattern matching
        rstLink = link[0].get('href')

        #rating of the rest
        if (div.findChildren("span", {"class": "rating-value"})):
            rstRating = div.findChildren(
                "span", {"class": "rating-value"})[0].text.strip()

            rstRating = float(rstRating)

        else:
            rstRating = 0.0

        #category of the rest
        rstCatg = div.findChildren("div",
                                   {"class": "grey-text"})[0].text.strip()

        #finding the offers available
        rstOffer = "No Offer"
        rstOfferValue = 0

        if (div.findChildren("span", {"class": "offer-text"})):
            rstOffer = div.findChildren(
                "span", {"class": "offer-text"})[0].text.strip()

            #if u"\u20b9" in rstOffer:
            #rstOfferValue = rstOffer[rstOffer.index(u"\u20b9")+1:rstOffer.index(" ")]
            if "%" in rstOffer:
                rstOfferValue = int((rstOffer[0:rstOffer.index("%")]).strip())

        #calling the func to calculate the rest score
        rstScore = scorecal(rstRating, rstOfferValue)
        #print(rstScore)

        rstInfo = dict()

        rstInfo['rstName'] = rstName
        rstInfo['rstRating'] = rstRating
        rstInfo['rstCatg'] = rstCatg
        rstInfo['rstOffer'] = rstOffer
        rstInfo['rstScore'] = rstScore
        rstInfo['rstLink'] = rstLink

        allRest.append(rstInfo)

    # sort once, after every card on the page has been processed
    sortedAllRest = sorted(allRest,
                           key=lambda i: i['rstScore'],
                           reverse=True)

    return sortedAllRest
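# A minimal usage sketch (not part of the original script): it fills in hypothetical
# values for the module-level names the function above relies on -- cookie_jar,
# headersUA, allRest and scorecal -- and prints the top results for one page.
import requests
import re
from bs4 import BeautifulSoup as BS

cookie_jar = requests.cookies.RequestsCookieJar()
headersUA = {"User-Agent": "Mozilla/5.0"}
allRest = []

def scorecal(rating, offerValue):
    # hypothetical weighting: rating (0-5) scaled to 0-50, plus half the offer percentage
    return rating * 10 + offerValue * 0.5

if __name__ == "__main__":
    for rst in scrape_online_delv(1)[:5]:
        print(rst['rstName'], rst['rstScore'], rst['rstLink'])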
print("-------------")
with open('grayscale_scraping.csv', mode='w', newline='') as output_file:
    file_writer = csv.writer(output_file,
                             delimiter=',',
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
    file_writer.writerow([
        'asset_name', 'aum', 'shares', 'asset_per_share', 'holdings_per_share',
        'market_per_share'
    ])

    for asset in urls:

        print(asset[0])
        response = scraper.get(asset[1]).text
        table = BS(response, "html.parser")
        overviewdata = table.find("table", {"class": "overview-data"})

        if (asset[0] == "ETC"):
            #AUM
            aum = overviewdata.findAll("tr")[9]
            aum = aum.findAll("td")[1].text
            aum = aum.replace("*", "")
            aum = aum.replace("‡", "")
            print("AUM: " + aum)

            #shares outstanding
            shares = overviewdata.findAll("tr")[10]
            shares = shares.findAll("td")[1].text
            shares = shares.replace("*", "")
            shares = shares.replace("‡", "")
reload(sys)  # Python 2 only: reload sys so setdefaultencoding is exposed again
sys.setdefaultencoding("utf-8")

def change_code(sentence):
	s_list = sentence.split(" ")
	s_list = [unicode(x) for x in s_list]
	return " ".join(s_list)

data_file = sys.argv[1]
#data_file = "IndianHistory"
os.system('java -cp "../stanford-corenlp-full-2018-02-27/*" -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file ../dataset/'+data_file+'.txt')
# exit(0)
xml_path = "./"+data_file+".txt.xml"
xml_fd = open(xml_path, "r")
xml_file = xml_fd.read()
xml_soup = BS(xml_file, 'lxml')
data_fd = io.open("../dataset/"+data_file+".txt", encoding='utf-8')

data_lines = data_fd.readlines()
# new_data
# print data_lines[0]
data_fd.close()
for x in xml_soup.find_all('coreference'):
	structure_list = list(x.children)
	for s_items in structure_list:
		if s_items == "\n":
			continue
		else:
			mention_list = []
			s_items_list = list(s_items.children)
			# print len(s_items_list)
Example #4
def spider():
    browser = Browser()
    browser.visit('http://www.baidu.com')
    browser.execute_script(
        "window.location.href = 'http://bf.310v.com/3.html'")
    time.sleep(10)
    while True:
        import config
        reload(config)
        soup = BS(browser.html, 'html5lib')
        table = soup.select('table#idt')[0]
        a3_trs = table.find_all('tr', class_='a3')
        a4_trs = table.find_all('tr', class_='a4')
        a3_trs.extend(a4_trs)
        for tr in a3_trs:
            # only rows without style='display: none' that are not ad rows
            if (not tr.has_attr('style')) and tr['id'].find('ad') == -1:
                time_td_text = tr.find_all('td')[3].get_text()  # td holding the match time/status
                match_id = tr['id']
                end_score = tr.find_all('td')[5].get_text()
                middle_score = tr.find_all('td')[7].get_text()
                match_news = News.objects.filter(match_id=match_id)

                if match_news:
                    if time_td_text.find(u'完') > -1:
                        for match_new in match_news:
                            match_new.end_score = end_score
                            match_new.middle_score = middle_score
                            match_new.save()
                    if time_td_text.find(u'中') > -1:
                        for match_new in match_news:
                            match_new.middle_score = middle_score
                            match_new.save()

                if re.match(r'\d+', time_td_text
                            ) and int(time_td_text) < config.STATUS_TIME:
                    num1_td = tr.find_all('td')[9]
                    num2_td = tr.find_all('td')[11]
                    yapan1 = num1_td.find_all('div')[0].get_text()
                    yapan2 = num2_td.find_all('div')[0].get_text()
                    daxiaopan1 = num1_td.find_all('div')[1].get_text()
                    daxiaopan2 = num2_td.find_all('div')[1].get_text()

                    tds = tr.find_all('td')
                    ftype = tds[1].find('font').get_text()  # match type
                    gamestarttime = tds[2].get_text()
                    gamestatus = time_td_text
                    team1 = tds[4].find_all('font')[2].get_text()
                    score = tds[5].get_text()
                    team2 = tds[6].find_all('font')[0].get_text()
                    halfscore = tds[7].get_text()

                    yapanSB = re.sub(r'\s', '',
                                     tds[10].find_all('div')[0].text)
                    daxiaopanSB = tds[10].find_all('div')[1].text

                    same_match_sep = datetime.datetime.now(
                    ) - datetime.timedelta(seconds=config.SAME_MATCH_SEP_TIME)
                    matchs = News.objects.filter(score=score).filter(
                        team1=team1).filter(team2=team2).filter(
                            create_time__gte=same_match_sep)
                    # print team1, team2, score, halfscore
                    for each in config.YAPAN:
                        if yapan1 == each.split(
                                '-')[0] and yapan2 == each.split('-')[1]:
                            # print each, yapan1, yapan2
                            if score != '0-0' and halfscore != '0-0' and len(
                                    matchs.filter(findex=each)) == 0:
                                try:
                                    winsound.PlaySound('nokia.wav',
                                                       winsound.SND_PURGE)
                                except:
                                    pass
                            news = News.objects.create(
                                match_type=ftype,
                                game_start_time=gamestarttime,
                                status=gamestatus,
                                team1=team1,
                                team2=team2,
                                half_score=halfscore,
                                score=score,
                                yapan=yapan1 + '-' + yapan2,
                                daxiaopan=daxiaopan1 + '-' + daxiaopan2,
                                findex=each,
                                match_id=match_id,
                                yapanSB=yapanSB,
                                daxiaopanSB=daxiaopanSB)
                            news.save()
                    for each in config.DAXIAOPAN:
                        if daxiaopan1 == each.split(
                                '-')[0] and daxiaopan2 == each.split('-')[1]:
                            # print each, daxiaopan1, daxiaopan2
                            if score != '0-0' and halfscore != '0-0' and len(
                                    matchs.filter(findex=each)) == 0:
                                try:
                                    winsound.PlaySound('nokia.wav',
                                                       winsound.SND_PURGE)
                                except:
                                    pass
                            news = News.objects.create(
                                match_type=ftype,
                                game_start_time=gamestarttime,
                                status=gamestatus,
                                team1=team1,
                                team2=team2,
                                half_score=halfscore,
                                score=score,
                                yapan=yapan1 + '-' + yapan2,
                                daxiaopan=daxiaopan1 + '-' + daxiaopan2,
                                findex=each,
                                match_id=match_id,
                                yapanSB=yapanSB,
                                daxiaopanSB=daxiaopanSB)
                            news.save()
        time.sleep(config.SPIDER_SEP_TIME)
utterance will begin with NO NON-DOM. If there is a non-dominant hand gloss in the utterance, there will be **NON-DOM** followed by the non-dominant hand gloss."""

from bs4 import BeautifulSoup as BS
import re

partial_path = """<write the path to "ncslgr-xml">"""  # Write the path location where ncslgr-xml is saved on your local machine

dominant_only_gloss = ()
dominant_and_non_dominant_gloss = ()

with open(partial_path + r'\football.xml', 'r') as f_IN:
    with open(
            """Path name to file output""", 'a'
    ) as f_OUT_utts:  # Write path to the file name you want to use to save the output to
        soup = BS(f_IN.read(), 'xml')

        for utterance_tag in soup.find_all('UTTERANCES'):
            for utterance_tags in utterance_tag.find_all('UTTERANCE'):
                if utterance_tags.find_all('TRACK', {'FID': '10001'}):
                    for dominant_track_tags in utterance_tags.find_all(
                            'TRACK', {'FID': '10000'}):
                        for dominant_a_tags in dominant_track_tags.find_all(
                                'A'):
                            if dominant_a_tags.has_attr('VID'):
                                dominant_a_tags.decompose()
                        for non_dominant_track_tags in utterance_tags.find_all(
                                'TRACK', {'FID': '10001'}):
                            for non_dominant_a_tags in non_dominant_track_tags.find_all(
                                    'A'):
                                if non_dominant_a_tags.has_attr('VID'):
Example #6
        if index != 0:
            wfundid, name = nameStr.split(',')
            if '-' in wfundid:
                wfundDict[wfundid] = name

    print('MMA境外基金數:{}'.format(len(wfundDict)))
    print('==================' * 2)

    ## Domestic funds ##
    fundidsList = list(fundDict.keys())
    ## Fetch domestic fund basic info / manager history / holdings (by stock / by category) ##
    url_domestic_base = 'http://mmafund.sinopac.com/w/wr/'
    for no, fundid in enumerate(fundidsList):
        html_domestic_info = requests.get(url_domestic_base + 'wr01.djhtm?a=' +
                                          fundid).text
        soup_domestic_info = BS(html_domestic_info, "lxml")

        html_domestic_stock = requests.get(url_domestic_base +
                                           'wr04.djhtm?a=' + fundid).text
        soup_domestic_stock = BS(html_domestic_stock, "lxml")

        fundInfo_domestic = getFundBasicInfo(soup_domestic_info)
        fundManager_domestic = getFundManager(soup_domestic_info)

        fundStock_domestic = getDomesticStockHolding(soup_domestic_stock)
        fundShare_domestic = getDomesticShareHolding(html_domestic_stock)

        dictToDb(fundInfo_domestic, '[MMA國內基金基本資料]', con)
        dictToDb(fundManager_domestic, '[MMA國內基金歷任經理人]', con)

        dictToDb(fundStock_domestic, '[MMA國內基金持股狀況_個股]', con)
Example #7
def getForeignShareHolding(html_text_wb):
    """取得境外持股資料(圓餅圖)
    剖析mma中html含有js程式碼,資料隱藏在js其中
    params
    html_text : raw text(str)
    return : list of defaultdict
    """
    def getShareHoldingTable(stockGroupList):
        """轉換國外持股圓餅圖資料(getForeignShareHolding)為dict格式(pd.dataframe可直接使用)    
        """
        stockGroup = defaultdict(list)
        for index, (k, v) in enumerate(stockGroupList):

            if index > 1:
                stockGroup['項目'].append(k)
                stockGroup['投資金額(美元:萬)'].append(v)
            else:
                stockGroup[k] = v
        return stockGroup

    soup = BS(html_text_wb, "lxml")
    date_temp = soup.select('.wfb1ar')
    if date_temp:
        update_date = '/'.join(
            re.findall(r"\d+",
                       soup.select('.wfb1ar')[0].text))  ## data update date

        ### fundid ####
        fundid = re.findall(
            r"(?:a=)(.+)",
            soup.select('#itemTab')[0].find('a').get('href'))[0]
        fundid = fundid.strip()
        ###############

        string1 = 'DJGraphObj1'  # marker used to slice out the target JS string
        target_text = html_text_wb[html_text_wb.index(string1):]

        pat1 = r"(?:\'Title\':)(.+\')(?:])"
        investTitle = re.findall(pat1, target_text)  # grab and split the table title definitions

        pat2 = r"(?:\')(.*?)(?:\')"  # extract strings enclosed in single quotes
        pat3 = r"(?:\'PieV\':)(.+)"  # extract the string following 'PieV'
        #     investTitleByStock = re.findall(pat2,investTitle[0]) ## titles by sector (list)
        #     investTitleByStock
        table = defaultdict(list)
        tableAns = []
        # pdb.set_trace()
        for index, titleText in enumerate(investTitle):

            titleList = re.findall(pat2, titleText)
            if len(titleList) == 1:
                continue
            colname = titleList[0]
            titleList = titleList[1:]
            titleList.insert(0, 'fundid')
            titleList.insert(1, '資料日期')
            valueList = re.findall(pat2, re.findall(pat3, target_text)[index])
            valueList.insert(0, fundid)
            valueList.insert(1, update_date)

            table[colname] = list(zip(titleList, valueList))

            # typeName = ['持有類股','區域','產業']  # no longer used; an earlier version was wrong
            share_Holding_Dict = getShareHoldingTable(table[colname])
            share_Holding_Dict['分類'] = re.findall(r"產業|持有類股|區域", colname)[0]

            tableAns.append(share_Holding_Dict)

        return tableAns
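# A minimal usage sketch (an assumption, not from the original source): html_text_wb is
# the raw page source of a fund's foreign-holdings page, fetched elsewhere with requests
# as in the surrounding snippets. Each returned defaultdict mixes scalar fields (fundid,
# 資料日期, 分類) with list columns, so pd.DataFrame can consume it directly, as the
# docstring notes.
import pandas as pd

for holding in (getForeignShareHolding(html_text_wb) or []):
    # scalar fields are broadcast across the list columns by the DataFrame constructor
    print(pd.DataFrame(holding))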
Example #8
import requests
from bs4 import BeautifulSoup as BS

search = input().split()
name = ''

for i in search:
    name = name + '+' + i

name = name[1:]

html_code = requests.get('https://www.citilink.ru/search/?text=' + name).text
soup = BS(html_code, 'lxml')

page = soup.find('div', {'class': "main_content_wrapper search"})
items = page.find('div', {'class': 'main_content_inner'})
items = items.find(
    'div', {
        'class':
        'block_data__gtm-js block_data__pageevents-js listing_block_data__pageevents-js'
    })

for item in items.findAll('div', {'class': 'subcategory-product-item__body'}):
    title = item.find('span', {'class': 'h3'})
    print(title.a.get('title'))
    print(title.a.get('href'))
    Text = item.find('p', {'class': 'short_description'}).text
    print(Text)
Example #9
import requests
from bs4 import BeautifulSoup as BS

result = requests.get("http://midas.iiitd.com/")
src = result.content
soup = BS(src, 'lxml')

print(soup.find_all('img'))
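# A small follow-on sketch (not in the original snippet): print just the src attribute
# of each image instead of the whole tag list.
for img in soup.find_all('img'):
    print(img.get('src'))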
Example #10
File: pdf_spider.py  Project: mxxhcm/code
def search(sess,
           load_time,
           pay_load,
           cookie,
           root_path,
           resourses_path,
           local_path,
           chrome=None,
           page=1):
    page_dir = local_path + "/page_" + str(page)
    if not os.path.exists(page_dir):
        os.mkdir(page_dir)

    try_times = 5
    flag = False
    try_num = 0
    while try_times != 0:
        load_page_flag = True
        page_error_count = 5
        while load_page_flag:
            try:
                indexes = sess.post(resourses_path,
                                    data=pay_load,
                                    cookies=cookie)
                load_page_flag = False
                page_error_count -= 1
            except Exception as e:
                load_page_flag = True
                if page_error_count == 0:
                    return None
                print("Error:", e)

        connect_error = "connect()连接127.0.0.1:6600失败,错误号:10061."
        html_contents = indexes.text
        # print(html_contents)
        if html_contents.find(connect_error) != -1:  # error text found anywhere in the page
            try_num += 1
            try_times -= 1
            print("Can't connect server, try ", try_num, "time(s).")
            time.sleep(30)
        else:
            flag = True
            break

    if not flag:
        print("Wait some minutes, try again.")
        return None

    time.sleep(load_page_time)
    soup = BS(indexes.text, "html.parser")
    soup.prettify()

    numbers = None
    if page == 1:
        pdf_numbers_info = soup.find("div", class_="search_gs")
        pdf_numbers_pattern = "([0-9]*)篇;"
        pdf_numbers = re.findall(pdf_numbers_pattern, str(pdf_numbers_info))
        pdf_numbers = pdf_numbers[0]
        if pdf_numbers != '':
            numbers = int(pdf_numbers)
            print("Total find ", numbers, "files")
        else:
            return None

    # 3. parse the current page
    pattern = "downpaper\('(.*)'\);return false"
    results = soup.find_all("span", class_="down")
    print("Download page ", page, ".")
    for i, raw_link in enumerate(results):
        a = raw_link.find_all("a")
        if len(a) == 1:
            print("============================================")
            print("Current file don't have a download link.")
            print("")
            print("")
            continue
        link = re.findall(pattern, str(a[1]))
        download_url_website = link[0].replace("&amp;", "&")
        url = root_path + download_url_website

        pattern_title = "(&T)=(.*?)$"
        pattern_title_obj = re.compile(pattern_title)
        raw_title = re.findall(pattern_title_obj, url)[0][1]
        filename = parse.unquote(raw_title)
        filename = filename.replace("/", "")
        print("============================================")
        print("Preparing downloading: ", filename)
        file_path = page_dir + "/" + filename + ".pdf"
        if os.path.exists(file_path):
            print(filename, "already in dir : ", page_dir)
            print("")
            print("")
            continue

        # time.sleep(5)
        # retry the page load up to 5 times before giving up on this URL
        download_not_finish_flag = True
        error_count = 5
        while download_not_finish_flag:
            try:
                chrome.get(url)
                download_not_finish_flag = False
            except Exception as e:
                print("Error:", e)
                error_count -= 1
                if error_count == 0:
                    break

        # the original flow loads the same URL a second time; the retry logic is the same
        download_not_finish_flag = True
        error_count = 5
        while download_not_finish_flag:
            try:
                chrome.get(url)
                download_not_finish_flag = False
            except Exception as e:
                print("Error:", e)
                error_count -= 1
                if error_count == 0:
                    break

        time.sleep(load_time)
        hrefs = chrome.find_elements_by_xpath("//*[@href]")

        find_url_flag = False
        for href in hrefs:
            if href.text == '下载地址' or href.text == '镜像站-高速下载-1':
                find_url_flag = True
                download_url = href.get_attribute('href')
                print("Download url : ", download_url)
                download(download_url,
                         sess,
                         cookies={},
                         filename=page_dir + "/" + filename)
        if not find_url_flag:
            print("not find")
    if page == 1:
        return numbers
Example #11
from bs4 import BeautifulSoup as BS
from time import time
import psycopg2
import requests
import config
import sys
import re

init_ts = time()

links = list(
    filter(lambda x: bool(x), [
        a.get('href', False)
        for a in BS(requests.get(f'http://{sys.argv[1]}').text,
                    features='html.parser').find_all('a')
    ]))

with open('links.txt', 'w+') as fh:
    fh.write('\n'.join(links))

connection = psycopg2.connect(config.CONNECTION_STRING)
cursor = connection.cursor()

for link in links:
    cursor.execute("INSERT INTO LINKS(HREF, DOMAIN) VALUES(%s, %s)", (
        link,
        sys.argv[1],
    ))
    print(f'INSERTED {link}')

connection.commit()
Example #12
    # check to see the status code of our request
    print("Status code = {}".format(response.status_code))

    # Save the data to the local cache but only if status code is good
    if response.status_code == 200:
        with open(cache_file, 'w') as file:
            file.write(response.text)
            # Store the data retrieved in memory as well
            cache_data = response.text
            print("Cache data saved to disk and memory")

# Now check the data is not empty before we carry on
if len(cache_data) > 0:
    print("cache data OK")
    data = BS(cache_data, 'html.parser')
    print("Filtering per rulesets")
    products = data.find('div', attrs={
        'class': 'products-list'
    }).find_all('div', attrs={'class': 'product-card'})
    print("{} matches found".format(len(products)))
    print("Extracting relevant data")

    for product in products:
        title = product.find('a', attrs={'class': 'product-card__title'})
        price = product.find('div',
                             attrs={'class': 'product-card__price-value'})

        # Add the product to the list of items found
        items.append(prod(title.text, str(price.text).strip()))
Example #13
 def parse(self, response):
     return {'title': BS(response.text, 'html.parser').title.text}
Example #14
def find_login_param(onyma, login=None, account_name=None):
    url_ip = 'https://10.144.196.37'
    url_main = 'https://10.144.196.37/onyma/main/'
    try:
        if (account_name is None) and (login is not None):
            payload = {
                'prpoper1': 'Like',
                'prpv1': login,
                'prpc': '0',
                'search': 'Поиск'
            }
            html = onyma.post(
                'https://10.144.196.37/onyma/main/dogsearch_ok.htms',
                data=payload,
                verify=False).text
            if '<title>Результаты поиска</title>' in html:
                url = BS(html,
                         'lxml').find('a',
                                      title=re.compile('-.+руб.')).get('href')
                html = onyma.get(url_main + url).text

            url = BS(html,
                     'lxml').find('a', title=re.compile('Договор')).get('href')
            html = onyma.get(url_ip + url).text

            # Look up the account name
            links = BS(html, 'lxml').find_all('a')
            for link in links:
                url = link.get('href')
                if 'clsrv.htms' in url:
                    html = onyma.get(url_main + url).text
                    if login in html:
                        account_name = re.search(
                            r'\]\. (\S+)',
                            BS(html,
                               'lxml').find('title').text).group(1).strip()
            url = BS(html, 'lxml').find('a', id='menu4185').get('href')
            html = onyma.get(url_ip + url).text
            url = url_main + BS(html, 'lxml').find(
                'td', class_='td1').find('a').get('href')
            html = onyma.get(url).text
        elif (login is None) and (account_name is not None):
            html = onyma.post(
                'https://10.144.196.37/onyma/main/dogsearch_ok.htms', {
                    'sitename': account_name,
                    'search': 'Поиск'
                },
                verify=False).text
            url = BS(html,
                     'lxml').find('a', title=re.compile('Договор')).get('href')
            html = onyma.get(url_ip + url).text
            url = BS(html, 'lxml').find('a', id='menu4185').get('href')
            html = onyma.get(url_ip + url).text
            url = url_main + BS(html, 'lxml').find(
                'td', class_='td1').find('a').get('href')
            html = onyma.get(url).text
        else:
            return False
        urls = []
        links = BS(html, 'lxml').find_all('a')
        for link in links:
            url = link.get('href')
            if ('service=201'
                    in url) or ('service=4610' in url) and (link.text
                                                            == account_name):
                urls.append(url_main + url)
    except:
        return False
    result_url = ''
    result_date = 1
    for url in urls:
        try:
            html = onyma.get(url).text
            current_date = int(
                BS(html,
                   'lxml').find('td',
                                class_='td1').find('a').text.split('.')[0])
        except:
            continue
        if current_date >= result_date:
            result_date = current_date
            result_url = url
    if result_url != '':
        bill = re.search(r'bill=(\d+)', result_url).group(1)
        dmid = re.search(r'dmid=(\d+)', result_url).group(1)
        tmid = re.search(r'tmid=(\d+)', result_url).group(1)
        return {
            'account_name': account_name,
            'bill': bill,
            'dmid': dmid,
            'tmid': tmid
        }
    elif account_name is not None:
        return {
            'account_name': account_name,
            'bill': None,
            'dmid': None,
            'tmid': None
        }
    else:
        return False
Example #15
File: text.py  Project: littlepaike/LOL
def geteid(html):
    soup = BS(html, 'html.parser')
    return soup.find_all('option')
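# A minimal usage sketch (an assumption, not from the original project): html holds the
# page source fetched elsewhere; each <option> element exposes its value attribute and text.
for option in geteid(html):
    print(option.get('value'), option.text.strip())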
Example #16
    def recipe_finder(self, foodname, num_recipe=3):

        #get food name as keyword
        keyword = foodname

        # crudely percent-encode the UTF-8 bytes of the keyword for the query string
        keyword = str(
            keyword.encode('utf-8')).lstrip("b'").rstrip("'").replace(
                "\\x", "%")

        # get url
        url = 'http://www.10000recipe.com/recipe/list.html?q=' + keyword

        #connect to the site
        response = requests.get(url)

        html = response.text

        #parse using BS
        soup = BS(html, 'lxml')

        #get cookbook link

        if soup.find("a", "thumbnail") == None:
            recipe = ''
            recipe_list = []
        else:
            cook_link_list = soup.find_all('a', "thumbnail")

            cook_urls = []

            for cook_links in cook_link_list:
                cook_link = cook_links['href']
                cook_urls.append('http://www.10000recipe.com' + cook_link)
            cook_urls = cook_urls[0:num_recipe]

            #connect to the site
            recipe_list = []
            for cook_url in cook_urls:
                response = requests.get(cook_url)

                html = response.text

                #parse using BS
                soup2 = BS(html, 'lxml')
                contents = soup2.find_all('meta', {'name': "keywords"})

                #get recipe
                recipe_content = contents

                recipe = ''

                for rec in recipe_content:
                    recipe = recipe + str(rec['content'])

                #replace useless tokens
                #recipe = recipe.replace('text/html; charset=euc-kr','')
                #recipe = recipe.replace('\r\n','')
                #recipe = re.sub('http.+','',recipe)
                recipe_list.append(recipe)

        return recipe_list
Example #17
def get_page_data(html):  # parse the game listings
    soup = BS(html, 'lxml')

    items = soup.find_all('div', class_='item')
    for item in items:
        url = 'https://www.playground.ru' + item.find(
            'div', class_='media-heading title').find('a').get('href')
        soup = BS(get_html(url), 'lxml')

        gameCard = soup.find('div', class_='gp-game-card-top')

        try:
            name = gameCard.find('h1', class_='gp-game-title').text.strip()
            sp = name.split("  ")
            name = sp[0].strip()
        except:
            name = ''

        try:
            genres = gameCard.find('div', class_='genres').text.strip()
            genres = 'Жанры: ' + ", ".join(genres.split('\n\n'))

        except:
            genres = ''

        try:
            releaseList = gameCard.find('div', class_='releases').find_all(
                'div', class_='release-item')
        except:
            releaseList = ''
        release = 'Дата выхода:' + '\n'
        for i in releaseList:
            release += ' '.join(i.text.split()) + '\n'
        release = release.strip()

        try:
            info = soup.find('div', class_='description-wrapper').text.strip()
        except:
            info = ''

        try:
            info += '\nРазработчик: ' + gameCard.find(
                'div', class_='game-card-info js-redirect').find(
                    'span', itemprop="name").text.strip()
        except:
            pass

        try:
            info += '\nИздатель: ' + gameCard.find(
                'div', class_='game-card-info js-redirect').find(
                    'span', itemprop="publisher").text.strip()
        except:
            pass

        try:
            photo = soup.find('div',
                              class_='gp-game-cover').find('a').get('href')
        except:
            photo = ''

        data = {
            'name': name,
            'genres': genres,
            'release': release,
            'info': info,
            'photo': photo,
            'url': url
        }

        write_csv(data)
Example #18
 def go(self, url):
     self.phantom.get(url)
     self.soup = BS(self.phantom.page_source, 'lxml')
Example #19
def getDomesticShareHolding(html_text):
    """取得國內持股資料(圓餅圖)
    剖析mma中html含有js程式碼,資料隱藏在js其中
    params
    html_text : raw text(str)
    return : list of defaultdict
    """
    def getShareHoldingTable(stockGroupList):
        """轉換國內持股圓餅圖資料(getDomesticShareHolding)為dict格式(pd.dataframe可直接使用)    
        """
        stockGroup = defaultdict(list)
        for index, (k, v) in enumerate(stockGroupList):

            if index > 1:
                stockGroup['項目'].append(k)
                stockGroup['投資金額(萬元)'].append(v)
            else:
                stockGroup[k] = v
        return stockGroup

    ### get the fundid ###
    soup = BS(html_text, "lxml")
    fundid = re.findall(r"(?:a=)(.+)",
                        soup.select('#itemTab')[0].find('a').get('href'))[0]
    # print('fundid:{}'.format(fundid))
    ##### get the data date ####
    date_temp = soup.select('.wfb1ar')
    if date_temp:
        try:
            update_dateStr = re.findall(
                r"\d+\/\d+\/\d+",
                soup.select('.wfb1ar')[-1].text)[0]  # data date of the distribution tables
        except IndexError:
            update_dateStr = re.findall(r'\d+/\d+', date_temp[-1].text)[0]  ## year/month only
        #######################
        string1 = 'DJGraphObj1'  # marker used to slice out the target JS string
        target_text = html_text[html_text.index(string1):]

        pat1 = r"(?:\'Title\':)(.+)"
        investTitle = re.findall(pat1, target_text)  # grab and split the table title definitions

        pat2 = r"(?:\')(.*?)(?:\')"  # extract strings enclosed in single quotes
        pat3 = r"(?:\'PieV\':)(.+)"  # extract the string following 'PieV'

        table = defaultdict(list)
        tableAns = []
        for index, titleText in enumerate(investTitle):
            titleList = re.findall(pat2, titleText)
            if len(titleList) == 1:
                continue
            colname = titleList[1]
            titleList = titleList[2:]
            titleList.insert(0, 'fundid')
            titleList.insert(1, '資料日期')
            valueList = re.findall(pat2, re.findall(pat3, target_text)[index])
            valueList.insert(0, fundid)
            valueList.insert(1, update_dateStr)
            # print(titleList,valueList)
            table[colname] = list(zip(titleList, valueList))

            share_Holding_dict = getShareHoldingTable(table[colname])

            # typeName = ['持有類股','區域','產業']  ## no longer used; an earlier version was wrong
            share_Holding_dict['分類'] = re.findall(r"產業|持有類股|區域", colname)[0]

            tableAns.append(share_Holding_dict)
        return tableAns
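# As with getForeignShareHolding above, a usage sketch (an assumption): html_text is the
# raw page source fetched elsewhere, and each returned dict can go straight to pandas.
import pandas as pd

for holding in (getDomesticShareHolding(html_text) or []):
    print(pd.DataFrame(holding))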
Example #20
 def search(self, css_select, key_text, parser='lxml'):
     b = BS(self.html(), parser)  # honor the parser argument instead of hard-coding 'lxml'
     for module in b.find_all(text=re.compile(key_text)):
         path = self._get_absolute_path(module.parent)
         if css_select in path:
             return path
Example #21
    home_page_url = 'http://portal.chd.edu.cn/index.portal?.pn=p167'
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36"
    }
    cookies = login(login_url, headers, home_page_url)  # this function ultimately returns a cookies object
    #print(type(cookies))
    #print(cookies['JSESSIONID'])
    #print(cookies)
    session = requests.session()  # a Session keeps the cookies sent by the server
    url = 'http://bkjw.chd.edu.cn/eams/teach/grade/course/person!historyCourseGrade.action?projectType=MAJOR'
    session.headers = headers
    session.cookies = cookies  # manually attach the cookies returned by login()
    res = session.get(url)

    soup = BS(res.text, 'html.parser')  # lxml could not be installed on Android, so html.parser is used instead
    text1 = soup.findAll(class_=re.compile('griddata'))
    st0 = ['学年度', '学期', '门数', '总学分', '平均绩点']
    for i in text1:
        if len(i.contents) == 3:
            print(i.get_text())
            continue
        elif len(i.contents) == 9:
            st = text1[4].get_text().split('\n')[1:5]
            print(st[0] + ":")
            print(st0[2] + ": " + st[1])
            print(st0[3] + ": " + st[2])
            print(st0[4] + ": " + st[3])
            continue
        st = i.get_text().split('\n')[1:6]
        for j in range(5):
Example #22
from bs4 import BeautifulSoup as BS
import time, re, os, pickle, sys

tags = ["shiny", "shinyapps", "shiny-server", "shinydashboard"]
wd = "/home/tian/shinyExpert/StackOverflow/"

topics = []
for tag in tags:
    path = wd + tag + "/"
    print(path)
    HTML = os.listdir(path)
    HTML.sort()
    for i in range(0, len(HTML)):
        f = HTML[i]
        bs = BS(open(path + f).read(), 'html.parser')
        threads = [f]
        try:
            Q = bs.find("div", {"id": "question"})
            Q_text = Q.find("div", {"class": "post-text"})
            q_text = Q_text.get_text()
            Q_time = Q.findAll("div", {"class": "user-action-time"})
            q_time = Q_time[0].find("span")["title"]
            Q_name = Q.findAll("a", {"href": re.compile("/users/.*")})
            q_name = Q_name[len(Q_name) - 1].get_text()
            Q_cmnt = Q.findAll("tr", {"class": "comment"})
            cmnts = []
            if (len(Q_cmnt) > 0):
                for c in range(0, len(Q_cmnt)):
                    c_text = Q_cmnt[c].find("span", {
                        "class": "comment-copy"
                    }).get_text()
Example #23
base_url = 'https://djinni.co/jobs/?lang=uk&location=%D0%9A%D0%B8%D0%B5%D0%B2&' \
           'page=1&primary_keyword=Python'

domain = 'https://djinni.co'
jobs = []
urls = []

urls.append(base_url)
urls.append(base_url + '&page=2')
urls.append(base_url + '&page=3')

for url in urls:
    time.sleep(1)
    req = session.get(url, headers=headers)
    if req.status_code == 200:
        bsObj = BS(req.content, "html.parser")
        li_list = bsObj.find_all('li', attrs={'class': 'list-jobs__item'})
        for li in li_list:
            div = li.find('div', attrs={'class': 'list-jobs__title'})
            title = div.a.text
            href = div.a['href']
            short = "No Description"
            #company = 'No name'
            descr = li.find('div', attrs={'class': 'list-jobs__description'})
            if descr:
                short = descr.p.text
            jobs.append({
                'href': domain + href,
                'title': title,
                'descript': short,
                'company': "No name"
Example #24
def get_metadata(headers, departments):
    s = VideoClient(headers)
    logger.info("Getting metadata")
    vid_list = s.get(VIDEO_LIST_URL)
    vid_list_bs = BS(vid_list, features="lxml")

    logger.info("Parsing department list")
    dept_select = vid_list_bs.find("select", id="dep_id").findAll("option")
    dept_names = {el["value"]: el.text
                  for el in dept_select
                  if el["value"]}  # get rid of empty dept

    logger.info("Parsing course list JSON")
    metadata = vid_list_bs.findAll("script", type="text/javascript", src=None)
    metadata = '\n'.join(str(i)
                         for i in metadata)  # resistant to extra <script> tags
    metadata = [i for i in metadata.splitlines() if "JSON.decode" in i][0]

    metadata = metadata[metadata.index("{"):metadata.rindex("}") +
                        1]  # bounds of actual json
    metadata = json.loads(metadata)

    logger.info("Scraping Video List Request Format")
    video_data = video_post_data(s)

    # the metadata format is:
    # dept num: {course num: {course name, course num}}

    # desired format is
    # dept num: {
    #   text: dept name,
    #   thumbnail: dep thumb,
    #   courses: {course num: {
    #       text: course name,
    #       thumbnail: course thumb,
    #       videos: { id : {url, data} }
    # }}}
    # because this will be yamled easily

    def get_department(dep, client):
        if dep not in dept_names:
            dept_names[dep] = f"{dep} - Uncategorized"
        courses = metadata[dep]

        # thumbs = {}
        course_metadata = {}
        for c in courses:
            videos = get_videos(client, video_data, dep, c)
            if videos is None:
                continue

            course_metadata[c] = {
                "text": html.unescape(courses[c]["text"]),
                "videos": videos,
                # "thumbnail": thumbnail,
            }

            # thumbs[thumb_date] = thumbnail
        data = {
            "text": html.unescape(dept_names[dep]),
            "courses": course_metadata,
            # "thumbnail": thumbs[max(thumbs)],
        }
        return dep, data

    logging.info("Departments: %s", sorted(metadata.keys()))

    if departments == []:
        departments = metadata

    departments = [i for i in departments if i in metadata]
    clients = [VideoClient(headers) for i in range(len(departments))]

    sane_data = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(departments) +
                                               4) as executor:
        futures = executor.map(get_department, departments, clients)
        for dep, data in futures:
            logging.info("Done scraping %s", dep)
            sane_data[dep] = data

    return sane_data
Example #25
    def handle_data(self, data):
        if self._recording == "State2":
            self.code += data

    def handle_endtag(self, tag):
        if tag == "pre" and self._recording == "State2":
            self._recording = "State1"
        elif tag == "div" and self._recording == "State1":
            self._recording = "State0"


if __name__ == "__main__":
    team_addr = f"http://{sys.argv[1]}:5000"
    search_resp = requests.get(team_addr + "/search",
                               params={"query": "get_flag"})
    soup = BS(search_resp.text, "html.parser")
    for article in soup.find_all(name="article",
                                 attrs={"class": "media content-section"}):
        title: str = article.find(name="a", attrs={
            "class": "article-title"
        }).text
        lang = title.split()[0]
        code_tag: Tag = article.find(name="div", attrs={"class": "highlight"})
        code = CodeGetter(str(code_tag)).code
        if lang == "Python":
            flag_repr = re.search(r"\[[\d, ]+\]", code)
            if flag_repr:
                flag_repr = literal_eval(flag_repr.group())
                key = re.search(r"chr\(x \^ (\d+)\)", code)
                if key:
                    key = int(key.group(1))
Example #26
import os
from bs4 import BeautifulSoup as BS

docpath = r'E:\Users\yuyun\Desktop\workspace\TempJob\650'
files = os.listdir(docpath)

# namedict = {}
names = []
for i in files:

    filepath = os.path.join(docpath, i)
    print(filepath)
    with open(filepath, 'r', encoding='gbk') as fp:
        text = fp.read()
        bsobj = BS(text, 'html.parser')
        find = bsobj.find_all('tr')
        for j in find:
            tr = j.find_all('td')
            if len(tr) != 10:
                continue
            elif tr[-2].text == chr(8730):
                names.append(tr[-4].text)
            else:
                continue

    # namedict[filepath] = names
names = set(names)
print(names)
print(len(names))
import os
import urllib.request
from bs4 import BeautifulSoup as BS
import requests

print("Your current path: " + os.getcwd())
new_path = input("Enter Your New Path")
os.chdir(new_path)
l = int(input('first comic index( >= 39)'))
h = int(input('last comic index( <= 4537)'))
for i in range(l, h + 1):
    url = 'http://explosm.net/comics/' + str(i)
    src_code = requests.get(url)
    code = src_code.text
    soup = BS(code, 'html.parser')
    for img in soup.find_all('img', {'id': 'main-comic'}):
        img_url = 'http:' + img.get('src')
        print(img_url)
        img_url = img_url.strip()
        try:
            urllib.request.urlretrieve(img_url, 'Comic' + str(i) + '.jpeg')
        except:
            print("Can't download this... skipping")
Example #28
File: text.py  Project: littlepaike/LOL
def getfilename(html):
    soup = BS(html, 'html.parser')
    headers = soup.find('select', class_='').text
    content = remove_all(headers.split('\n'))  # remove_all() is a project helper defined elsewhere (not shown here)
    return content
Example #29
File: bs.py  Project: exefncs2/LU_BO_JIA
htm = '''<html><head><title>國立臺灣大學系統</title></head>
<body>
<p class="title"><b>三校聯盟 NTU SYSTEM</b></p>
<p class="ntu_system">
<a href="http://www.ntu.edu.tw" class="union" id="link1">臺灣大學</a>
<a href="http://www.ntnu.edu.tw" class="union" id="link2">臺灣師範大學</a>
<a href="http://www.ntust.edu.tw" class="union" id="link3">臺灣科技大學</a>
</p></body></html>
'''

from bs4 import BeautifulSoup as BS
soup = BS(htm, "html.parser")
A1 = soup.title
A2 = soup.find("a")  #<a>
A3 = soup.find("b")  #<b>
A4 = soup.find_all("a", {"class": "union"})
web = soup.find("a", {"id": "link1"})
data = soup.select(".union")  #list[]
B = soup.select("#link3")  #list[]
print(A1)
print("*" * 50)
print(A2)
print("*" * 50)
print(A3)
print("*" * 50)
print(A4)
print("*" * 50)
print(web.get("href"))  #常用GET網址
print("*" * 50)
for i in data:
    print(i)  #data[0~n]
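# Added sketch: B from soup.select("#link3") above is also a list, so index into it
# before reading attributes.
print("*" * 50)
print(B[0].get("href"))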
def scrape_dine_out(pageNo):

    global dineOffer
    global dineOfferValue

    r = requests.get(
        "https://www.zomato.com/ncr/west-delhi-restaurants?table_booking=1&page=%d"
        % pageNo,
        cookies=cookie_jar,
        headers=headersUA)
    #print(r.text)

    soup = BS(r.text, "html.parser")
    my_divs = soup.find_all("div", {"class": "search-snippet-card"})
    for div in my_divs:

        #name of the rest
        dineName = div.findChildren("a",
                                    {"class": "result-title"})[0].text.strip()

        #direct link to the rest page on zomato
        link = []
        link = div.findChildren("a", attrs={
            'href': re.compile("^https://")
        })  #re.compile helps in pattern matching
        dineLink = link[0].get('href')

        #rating of the rest
        if (div.findChildren("span", {"class": "rating-value"})):

            dineRating = div.findChildren(
                "span", {"class": "rating-value"})[0].text.strip()
            dineRating = float(dineRating)

        else:
            dineRating = 0.0

        #category of the rest
        dineCatg = div.findChildren("span",
                                    {"class": "col-m-12"})[0].text.strip()

        #finding the offers available (reset the defaults so an offer from the
        #previous restaurant does not carry over through the module-level globals)
        dineOffer = "No Offer"
        dineOfferValue = 0

        if (div.findChildren("a", {"class": "zgreen"})):

            dineOffer = div.findChildren("a",
                                         {"class": "zgreen"})[0].text.strip()

            if "%" in dineOffer:
                dineOfferValue = int(
                    (dineOffer[0:dineOffer.index("%")]).strip())

        #open timings
        dineTime = div.findChildren("div",
                                    {"class": "col-s-11"})[0].text.strip()

        #calling the function to calculate the rest score
        dineScore = scorecal(dineRating, dineOfferValue)
        #print(dineScore)

        dineInfo = dict()

        dineInfo['dineName'] = dineName
        dineInfo['dineRating'] = dineRating
        dineInfo['dineCatg'] = dineCatg
        dineInfo['dineOffer'] = dineOffer
        dineInfo['dineTime'] = dineTime
        dineInfo['dineScore'] = dineScore
        dineInfo['dineLink'] = dineLink

        allDine.append(dineInfo)

    # sort once, after every card on the page has been processed
    sortedAllDine = sorted(allDine,
                           key=lambda i: i['dineScore'],
                           reverse=True)

    return sortedAllDine