Example #1
def get_results(moss_url):
    resp = r.get(moss_url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html5lib')

    ps = soup('p')
    name = None
    row = soup.table('tr')[1]  # the first data row (row 0 is the header)
    first, second, lines = map(lambda x: x.text, row('td'))
    line, per = first.split()
    # The regex could be tightened, but it works for now...
    m = re.match(r".*/([a-z- ]*)/[A-Za-z_öë ]*\.[a-z]+", line)
    if m:
        if m.groups():
            name = '_'.join(m.groups())

    if not name:
        name = 'moss_%s' % moss_url[33:]

    matches = []

    for row in soup.table('tr')[1:]:
        first, second, lines = map(lambda x: x.text, row('td'))
        first = parse_col(first)
        second = parse_col(second)
        lines = int(lines)
        url = row.a['href']
        matches.append(Match(first, second, lines, url))

    fil = Filter()
    matches = list(filter(fil.include, matches))

    return Results(name, matches)
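The helpers parse_col, Match, Results, and Filter are defined elsewhere in this project and are not shown. A minimal sketch of the first three, assuming MOSS result columns of the form 'path/to/file.py (72%)' (the Filter is project-specific and omitted; field names are guesses):

import re
from collections import namedtuple

# Hypothetical stand-ins; the real project defines its own versions.
Match = namedtuple('Match', 'first second lines url')
Results = namedtuple('Results', 'name matches')

def parse_col(text):
    """Split a MOSS column such as 'a01/hello.py (72%)' into (path, percent)."""
    m = re.match(r'(.*)\s\((\d+)%\)\s*$', text.strip())
    return (m.group(1), int(m.group(2))) if m else (text.strip(), 0)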
Example #2
 def userinfo(self, desc):
     if desc.find('?reset=1') != -1:
         desc = self.user.get(Gamer.url + '?reset=1')
     desc = BeautifulSoup(desc, 'html.parser')
     self.depth = int(desc.h2.text.split()[-1])
     info = desc.table('tr')[2]('td')
     self.level = int(info[0].text)
     self.hp = int(info[1].text)
     self.xp = int(info[2].text)
     self.weapon = Weapon(info[3].text)
     self.inventory = self.get_inventory(desc.table('tr')[3].text)
     return desc
Example #3
def page_of_data(i):
    page_no = str(i)
    url_base = 'http://mobile311.sfgov.org/'
    url_ext = '?page=' + page_no + '&' + category
    url = url_base + url_ext + '&status=open'
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'lxml')
    #get report numbers
    reports = soup.table('span', "activity-timestamp")
    #get details from second page
    #should modify code to also get location information
    for line in reports:
        line = str(line)
        x = line.find("#") + 1
        y = x + 7
        z = line[x:y]
        #print z
        url_goal = url_base + "reports/" + z
        print url_goal  # debugging; maybe comment this line out
        page2 = urllib2.urlopen(url_goal)
        real_soup = BeautifulSoup(page2.read())
        blockquote = real_soup('blockquote')
        for lne in blockquote:
            request_type = lne.find_next_sibling('p')
            #print request_type
            if 'Fire hydrant' in str(request_type):
                print url_goal
                print blockquote
                pane = real_soup("div", "tab-pane active")
                #kids = real_soup.findchildren("div","tab-pane active")
                for ln in pane:
                    #kids = real_soup.findchildren("div","tab-pane active")
                    print ln
                    print "	kids**********************************"
Example #4
def page_of_data(i):
	page_no =str(i)
	url_base = 'http://mobile311.sfgov.org/'
	url_ext = '?page='+page_no+'&service_id=518d5892601827e3880000c5' # street and sidewalk cleaning
	url= url_base+url_ext+'&status=open'
	page = urllib2.urlopen(url)
	soup = BeautifulSoup(page.read(),'lxml')
	#get report numbers
	reports = soup.table('span',"activity-timestamp")
	#get details from second page
	#should modify code to also get location information
	for line in reports:
		line=str(line)
		x=line.find("#")+1
		y=x+7
		z=line[x:y]
		#print z
		url_goal = url_base+"reports/"+z
		#print url_goal
		page2 = urllib2.urlopen(url_goal)
		real_soup = BeautifulSoup(page2.read())
		blockquote = real_soup('blockquote')
		for lne in blockquote:
			request_type = lne.find_next_sibling('p') 
			#print request_type
			if 'Human / Animal Waste'in str(request_type):
				print url_goal
				print blockquote
				pane = real_soup("div","tab-pane active")
				#kids = real_soup.findchildren("div","tab-pane active")
				for ln in pane:
					#kids = real_soup.findchildren("div","tab-pane active")	
					print ln	
					print "	kids**********************************"
Example #5
def trade_spider():
    list_of_contracts = []
    page = 1
    last_page = max_page()
    while page <= last_page:

        url = "http://fcpir.ru/participation_in_program/contracts/?PAGEN_1=" +str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, features="html.parser")
        for link in soup.table("a"):
            if link.get("href").startswith("#"):
                continue
            else:
                href = "http://fcpir.ru" + link.get("href")         # Get link of project
                etap = get_single_item_data(href)                   # Get dictionary key - number of project
                for key, value in etap.items():                     # value - list of etaps
                    if len(value) < 1:
                        continue
                    else:
                        for v in value:
                            list_of_contracts.append([key, v])



        page+=1
    return list_of_contracts
Example #6
def getTable(handler):

    tagStart = '<table'
    tagStop = '</table>'

    indexStart = handler.find(tagStart)
    newString = handler[indexStart:]

    indexStop = newString.find(tagStop)
    newString = newString[:indexStop + len(tagStop)]

    soup = BeautifulSoup(newString, 'html.parser')
    rows = soup.find_all('tr')

    table = []

    for row in rows[:-1]:
        table.append([td.text for td in row('td')])

    for j in range(len(table)):
        for i in range(len(table[0])):
            if j > 0 and i in (5, 6, 7):
                value = re.sub(r'\d\d.\d\d.\d\d$', '', table[j][i])
                try:
                    table[j][i] = float(value)
                except ValueError:
                    try:
                        value = value.replace(',', '.')
                        table[j][i] = float(value)
                    except ValueError:
                        table[j][i] = value

    return (table)
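The nested try/except converts cells whose decimal separator may be either '.' or ','. The same intent can be kept in one small helper; a sketch only, not part of the original code:

def to_number(text):
    """Convert '1.5' or '1,5' to a float; fall back to the original string."""
    for candidate in (text, text.replace(',', '.')):
        try:
            return float(candidate)
        except ValueError:
            pass
    return text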
Example #7
def get_results(moss_url, name=None):
    if args.verbose >= 1:
        print(f"Getting {moss_url}")
    resp = r.get(moss_url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html5lib')

    if name is None:
        ps = soup('p')
        if len(ps) >= 3:
            name = ps[2].text.strip()
        if not name:
            name = 'moss_%s' % date_str()

    matches = []

    for row in soup.table('tr')[1:]:
        first, second, lines = map(lambda x: x.text, row('td'))
        first = parse_col(first)
        second = parse_col(second)
        lines = int(lines)
        url = row.a['href']
        matches.append(Match(name, first, second, lines, url))

    fil = Filter()
    matches = list(filter(fil.include, matches))

    return Results(name, matches)
Example #8
def page_of_data(i):
	page_no =str(i)
	url_base = 'http://mobile311.sfgov.org/'
	url_ext = '?page='+page_no+'&'+category
	url= url_base+url_ext+'&status=open'
	page = urllib2.urlopen(url)
	soup = BeautifulSoup(page.read(),'lxml')
	#get report numbers
	reports = soup.table('span',"activity-timestamp")
	#get details from second page
	#should modify code to also get location information
	for line in reports:
		line=str(line)
		x=line.find("#")+1
		y=x+7
		z=line[x:y]
		#print z
		url_goal = url_base+"reports/"+z
		print url_goal  # debugging; maybe comment this line out
		page2 = urllib2.urlopen(url_goal)
		real_soup = BeautifulSoup(page2.read())
		blockquote = real_soup('blockquote')
		for lne in blockquote:
			request_type = lne.find_next_sibling('p') 
			#print request_type
			if 'Fire hydrant'in str(request_type):
				print url_goal
				print blockquote
				pane = real_soup("div","tab-pane active")
				#kids = real_soup.findchildren("div","tab-pane active")
				for ln in pane:
					#kids = real_soup.findchildren("div","tab-pane active")	
					print ln	
					print "	kids**********************************"
Example #9
def get_results(moss_url):
    resp = r.get(moss_url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html5lib')

    ps = soup('p')
    name = None
    if len(ps) >= 3:
        name = ps[2].text.strip()
    if not name:
        name = 'moss_%s' % date_str()

    matches = []

    for row in soup.table('tr')[1:]:
        first, second, lines = map(lambda x: x.text, row('td'))
        first = parse_col(first)
        second = parse_col(second)
        lines = int(lines)
        url = row.a['href']
        matches.append(Match(first, second, lines, url))

    fil = Filter()
    matches = list(filter(fil.include, matches))

    return Results(name, matches)
Example #10
def compare_files(request, testcaseresultid):
    '''Function to visually compare expected output and actual output'''
    testcaseresult = get_object_or_404(TestcaseResult, pk=testcaseresultid)
    expected_output = os.path.join(settings.MEDIA_ROOT,
                                   testcaseresult.test_case.output_files.path)
    test_input = os.path.join(settings.MEDIA_ROOT,
                              testcaseresult.test_case.input_files.path)
    actual_output_tar = os.path.join(settings.MEDIA_ROOT,
                                     testcaseresult.output_files.path)
    input_lines = "\n".join(
        read_file(name=test_input,
                  readthis=testcaseresult.test_case.std_in_file_name))
    soup = BeautifulSoup(difflib.HtmlDiff().make_file(
        read_file(name=expected_output,
                  readthis=testcaseresult.test_case.std_out_file_name),
        read_file(name=actual_output_tar)))
    for row in soup.find('table').find_all('tr'):
        for col in row.find_all('td'):
            if col.has_attr('nowrap'):
                del col['nowrap']
    soup.table.tbody.insert_before(soup.new_tag("thead"))
    soup.table.thead.append(soup.new_tag("th"))
    for s in [
            'Line Number', 'Expected Output', None, 'Line Number',
            'Actual Output'
    ]:
        new_tag = soup.new_tag("th")
        if s:
            new_tag.string = s
        soup.table.thead.append(new_tag)
    soup.table = soup.find("table", {"rules": "groups"})
    soup.table['width'] = "100%"
    soup.table.insert_after(soup.new_tag('br'))
    new_tag = soup.new_tag("style", type='text/css')
    soup.style.insert_after(new_tag)
    new_tag.append(
        " table {border-collapse:collapse; table-layout:fixed;}table td {border:solid 1px; "
        "width:100px; word-wrap:break-word;} table th{border:solid 1px;text-align:center;}"
    )
    new_tag_style = soup.new_tag("style", type='text/css')
    new_tag.insert_after(new_tag_style)
    new_tag_style.append("td.diff_header {text-align:center}")
    for new_tag in soup.find_all('colgroup'):
        new_tag.extract()
    colgroup_tag = soup.new_tag('colgroup')
    soup.thead.insert_before(colgroup_tag)
    for w in ['2%', '8%', '40%', '2%', '8%', '40%']:
        colgroup_tag.append(soup.new_tag('col', width=w))
    assignment = testcaseresult.test_case.program.assignment
    return render_to_response("evaluate/fileComparison.html", {
        'course': assignment.course,
        'assignment': assignment,
        'tst': testcaseresult,
        'inp': input_lines,
        'table': str(soup),
        'error_msg': error_msg
    },
                              context_instance=RequestContext(request))
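difflib.HtmlDiff().make_file() returns a complete HTML page whose side-by-side diff sits in a <table>, which is what the view above restyles. A quick standalone look at that output (nothing beyond the standard library and bs4):

import difflib
from bs4 import BeautifulSoup

page = difflib.HtmlDiff().make_file(['expected\n'], ['actual\n'])
soup = BeautifulSoup(page, 'html.parser')
print(soup.find('table') is not None)  # True: the diff table is embedded in the page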
Example #11
def get_from_html(html):
	soup = BeautifulSoup(html, 'html.parser')
	rows = soup.table('tr', recursive=False)
	away_rows = rows[2].table('tr', recursive=False)[2:]
	home_rows = rows[4].table('tr', recursive=False)[2:]
	
	print "Home Team"
	process_rows(home_rows)
	print "Away Team"
	process_rows(away_rows)
Example #12
def down_table(url):
    driver.get(url)
    source = driver.page_source
    soup = Soup(source, 'lxml')
    table = Soup(str(soup.table()), 'lxml')

    table_list = list(table.find_all('td'))
    final_list = list(
        map(
            lambda x: str(x).split(r'<')[int(
                (len(str(x).split(r'<')) + 1) / 2) - 1].split(r'>')[-1],
            table_list))

    col_even1 = soup.find_all('tr', class_="even")[0]
    judge_a = Soup(str(col_even1), 'lxml')
    judge_list = list(judge_a.find_all('td'))
    judge_list = list(
        map(
            lambda x: str(x).split(r'<')[int(
                (len(str(x).split(r'<')) + 1) / 2) - 1].split(r'>')[-1],
            judge_list))
    if len(judge_list) != 19:
        pad = ['--']
        dif = 19 - len(judge_list)
        final_list[19 + len(judge_list):19 + len(judge_list)] = pad * dif
    name = re.findall(r'<h2>.+?</h2>',
                      source)[0].split(r'<')[-2].split(r'>')[1]
    book = Workbook()
    sheet1 = book.active
    sheet1.title = "电影信息"
    sheet1.merge_cells('A1:S1')
    sheet1.cell(row=1, column=1, value=name)
    head = ['时间', '网票', '哈票', '万达', '金逸', '淘电影', '星美']
    for i in range(len(head)):
        if i == 0:
            sheet1.cell(row=2, column=i + 1, value=head[i])
        else:
            sheet1.cell(row=2, column=3 * i - 1, value=head[i])
    sheet1.merge_cells('B2:D2')
    sheet1.merge_cells('E2:G2')
    sheet1.merge_cells('H2:J2')
    sheet1.merge_cells('K2:M2')
    sheet1.merge_cells('N2:P2')
    sheet1.merge_cells('Q2:S2')
    for i in range(int(len(final_list) / 19)):
        for j in range(19):
            sheet1.cell(row=i + 3, column=j + 1, value=final_list[19 * i + j])
    book.save(name + '.xlsx')
    splitcri = u'每日票房数据统计'
    return name.split(splitcri)[0]
Example #13
def get_from_html(html):
	soup = BeautifulSoup(html, 'html.parser')
	data_table = soup.table('tr', recursive=False)[7].table
	# .tag is equal to .find('tag').  ('tag') is equal to .find_all('tag').
	# below line is equivalent to above line, for example
	# data_table = soup.find('table').find_all('tr', recursive=False)[7].find('table')
	
	# start with away team
	away = True
	
	# display a header to make output more readable
	print 'T P# P Player Name            G A P +- PN PIM   TOT SHF   AVG    PP    SH    EV  S AB MS TH GV TK BS FW FL  F%'
	# first two rows just contain headings
	for tr in data_table('tr', recursive=False)[2:]:
		try:
			# rows starting with integers (player numbers) have relevant data
			int(tr.td.text)
		except ValueError:
			# Once we hit rows that don't start with numbers
			# we are at the summary lines between teams
			# and further player data will be for the home team
			away = False
			# since this line is invalid, go to next line instead of doing below processing
			continue
			
		(
			num, pos, name, g, a, p, pm, pn, pim,
			tot, shf, avg, pp, sh, ev,
			s, ab, ms, th, gv, tk, bs, fw, fl, fp
		) = [td.text.strip() for td in tr('td')]
		
		# name must be encoded in utf8 to ensure display of accented E
		name = name.encode('utf8')
		
		# could convert +/- values into numbers
		# when empty, throws an error, so could overwrite to
		# preserve being empty, or just save as 0
		try:
			pm = int(pm)
		except ValueError:
			pm = ''
		
		print '{:1} {:>2} {:1} {:22.22} {:1} {:1} {:1} {:>2} {:>2} {:>3} {:>5} {:>3} {:>5} {:>5} {:>5} {:>5} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>3}'.format(
			'A' if away else 'H',
			num, pos, name.encode('ascii','replace'), g, a, p, pm, pn, pim,
			tot, shf, avg, pp, sh, ev,
			s, ab, ms, th, gv, tk, bs, fw, fl, fp
		)
Example #14
def predej_dluzne(evidence, db, vypis, sumplus, summinus, pocet,
                    csv_nejpozdeji):
    #jirkovo = nacti_jirkovo_ze_souboru('jirkovo.html')
    br = sa_login("Mirek Zv.", "miiirek1+1")
    sleep(2)
    jirkovo = br.open(url_zakaznici).read()
    vfp.strtofile(jirkovo, os.path.join(os.getcwd(),
                  'applications', 'platby', 'downloads', 'zakaznici.html'))
    # slight duplication with controllers/platby.py, where this file is parsed
    #   to determine the current deposit
    soup = BeautifulSoup(jirkovo)
    for zakaznik in soup.table('tr'):
        sloupce = zakaznik('td')
        if len(sloupce):   # the first row (the header) has no <td>
            planovano = unformat_castka(sloupce[-1].string)
            neuhrazeno = unformat_castka(sloupce[-2].string)
            zaloha = unformat_castka(sloupce[-4].string)
            chybi = planovano + neuhrazeno - zaloha
            if chybi>0:  
                symbol = str(sloupce[0].a.string).strip().lstrip('0')
                wk_zakaznik = db(db.auth_user.ss==symbol).select().first()
                if wk_zakaznik and wk_zakaznik.zaloha>0:
                    jeste_chybi = chybi - evidence.get(wk_zakaznik.id, 0)
                      # minus what we just paid the customer in predej_planovane()
                    if jeste_chybi:
                        fl_zaloha = float(wk_zakaznik.zaloha)
                        popis = (u'z sa.cz poptával %s Kč' % jeste_chybi
                                ) if (jeste_chybi>fl_zaloha) else ''
                        posleme_mu = min(jeste_chybi, fl_zaloha) 
                        id_pohybu = db.pohyb.insert(
                            idauth_user=wk_zakaznik.id,
                            idma_dati=Uc_sa.oz,
                            iddal=Uc_sa.oz_sa,
                            datum=datetime.now(),
                            castka=posleme_mu,
                            ss=symbol,
                            popis=popis
                            )
                        wk_zakaznik.update_record(zaloha=fl_zaloha-posleme_mu)
                        pohyb = db(db.pohyb.id==id_pohybu).select().first()
                        vypis1, sumplus1, summinus1 = __add_csv(
                                          pohyb, csv_nejpozdeji)
                        vypis += vypis1
                        sumplus += sumplus1
                        summinus += summinus1
                        #db.commit() - the commit happens in the csv.py controller
                        pocet += 1
    return pocet, vypis, sumplus, summinus
Example #15
 def get_faculties(self):
     '''
     Retrieves the faculties from eclass.teilar.gr
     The output is dictionary with the following structure:
     faculties_from_eclass = {'url': ['name', 'code']}
     '''
     faculties_from_eclass = {}
     output = teilar_anon_login('http://openclass.teilar.gr/modules/auth/listfaculte.php')
     soup = BeautifulSoup(output)
     all_faculties = soup.table('td')
     for faculty in all_faculties:
         url = 'http://openclass.teilar.gr/modules/auth/' + faculty.a.get('href')
         name = faculty.a.contents[0].strip()
         code = faculty.small.contents[0].split(')')[0].replace('(', '').strip()
         faculties_from_eclass[url] = [name, code]
     return faculties_from_eclass
Example #16
def get_MBB_tds(content_url):
    # now get the actual content
    r = s.post(content_url)

    # r.content spits out the content
    soup = BeautifulSoup(r.content)

    # make array of all "rows" (i.e. <tr> tags)
    alltrs = soup.table("tr")

    # get just the <td>...</td> content in an array
    alltds = []
    for row in alltrs:
        alltds.append(row.td)

    return alltds
Example #17
def __get_zaloha(ss):
    '''determine the customer's recent deposit by parsing the saved zakaznici.html
    '''
    # duplicates export_csv.predej_dluzne, from which this was lifted
    zaloha = fdate = None
    try:
        fname = os.path.join(request.folder, 'downloads', 'zakaznici.html')
        fdate = datetime.fromtimestamp(os.stat(fname).st_ctime)
        jirkovo = vfp.filetostr(fname)
        soup = BeautifulSoup(jirkovo)
        for zakaznik in soup.table('tr'):
            if str(zakaznik.td.a.string).strip().lstrip('0')==ss:
                zaloha = unformat_castka(zakaznik('td')[-4].string) 
                break
    except:
        pass
    return fdate, zaloha
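unformat_castka is shared with Example #14 but not shown here; a minimal sketch, assuming amounts formatted in the Czech style ('1 234,50 Kč': spaces as thousands separators, a comma as the decimal mark):

import re

def unformat_castka(text):
    """Parse an amount such as '1 234,50 Kč' into a float; return 0.0 on failure."""
    if text is None:
        return 0.0
    digits = re.sub(r'[^\d,.\-]', '', str(text)).replace(',', '.')
    try:
        return float(digits)
    except ValueError:
        return 0.0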
Example #18
def ajax():
    today = datetime.date.today()
    initial_day = datetime.date(2000, 1, 1)
    payload = {
        "__EVENTTARGET": "date_cal",
        "__EVENTARGUMENT": (today - initial_day).days,
        "classlist_ddl": request.args.get('building'),
        "__VIEWSTATE":
        "dDw1NTk0MzU4NjE7dDw7bDxpPDE+Oz47bDx0PDtsPGk8Mz47aTw0Pjs+O2w8dDxAMDw7Ozs7Ozs7Ozs7Pjs7Pjt0PDtsPGk8NT47PjtsPHQ8QDA8cDxwPGw8U0Q7PjtsPGw8U3lzdGVtLkRhdGVUaW1lLCBtc2NvcmxpYiwgVmVyc2lvbj0xLjAuNTAwMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPWI3N2E1YzU2MTkzNGUwODk8MjAxNy0xMi0wNT47Pjs+Pjs+Ozs7Ozs7Ozs7Oz47Oz47Pj47Pj47Pj47Pu5S1476NkYk5hmd81mL76xisA4B",
        "__VIEWSTATEGENERATOR": "D2C5BC33"
    }
    r = requests.post(query_url, data=payload)
    html_data = BeautifulSoup(r.text, "html5lib")
    table_data = {
        row("td", nowrap="nowrap")[0].text.strip():
        [cell.text.strip() for cell in row("td", nowrap="nowrap")][1:]
        for row in html_data.table("tr", nowrap="nowrap")
    }
    return jsonify(table_data)
Example #19
def down_table(url):
    driver.get(url)
    source = driver.page_source
    soup = Soup(source,'lxml')
    table = Soup(str(soup.table()),'lxml')

    table_list = list(table.find_all('td'))
    final_list = list(map(lambda x: str(x).split(r'<')[int((len(str(x).split(r'<'))+1)/2)-1].split(r'>')[-1],table_list))
    
    col_even1 = soup.find_all('tr',class_ ="even")[0]
    judge_a = Soup(str(col_even1),'lxml')   
    judge_list = list(judge_a.find_all('td'))
    judge_list = list(map(lambda x: str(x).split(r'<')[int((len(str(x).split(r'<'))+1)/2)-1].split(r'>')[-1],judge_list))     
    if len(judge_list) != 19:
        pad  = ['--']
        dif = 19 - len(judge_list)
        final_list[19+len(judge_list):19+len(judge_list)] = pad*dif
    name = re.findall(r'<h2>.+?</h2>',source)[0].split(r'<')[-2].split(r'>')[1]
    book=Workbook()
    sheet1=book.active
    sheet1.title = "电影信息"
    sheet1.merge_cells('A1:S1')
    sheet1.cell(row = 1,column=1 ,value = name) 
    head = ['时间','网票','哈票','万达','金逸','淘电影','星美']
    for i in range(len(head)):
        if i == 0:
            sheet1.cell(row = 2,column=i+1,value = head[i]) 
        else:
            sheet1.cell(row = 2,column=3*i-1,value = head[i]) 
    sheet1.merge_cells('B2:D2')
    sheet1.merge_cells('E2:G2')
    sheet1.merge_cells('H2:J2')
    sheet1.merge_cells('K2:M2')
    sheet1.merge_cells('N2:P2')
    sheet1.merge_cells('Q2:S2')
    for i in range(int(len(final_list)/19)):
        for j in range(19):
            sheet1.cell(row = i+3,column=j+1 ,value = final_list[19*i+j]) 
    book.save(name + '.xlsx')  
    splitcri = u'每日票房数据统计'
    return name.split(splitcri)[0]    
Example #20
def page_of_data(i): # normally the web-page with ten reports on it.
	page_no =str(i)
	url_base = 'http://mobile311.sfgov.org/'
	#url_ext = '?page='+page_no+'&service_id=518d5892601827e3880000c5' # street and sidewalk cleaning
	url_ext = '?page='+page_no+'&service_id=55e8409a45ff461f92000006' # homeless concerns
	# change this line for other type of service reports
	url= url_base+url_ext+'&status=open' #status closed is possible
	page = urllib2.urlopen(url)
	soup = BeautifulSoup(page.read(),'lxml')
	#get report numbers from first page
	reports = soup.table('span',"activity-timestamp") # using find_all => gives [ ]


	#get details from second page
	#should modify code to also get location information
	for line in reports:
		line=str(line)
		x=line.find("#")+1
		y=x+7
		z=line[x:y]
		
		#print z - the active report number
		url_goal = url_base+"reports/"+z
		#print url_goal

		page2 = urllib2.urlopen(url_goal)
		real_soup = BeautifulSoup(page2.read())
		#print real_soup # for debugging
		
		

		blockquote = real_soup('blockquote')
		for lne in blockquote:
			request_type = lne.find_next_sibling('p') 
			#print request_type
			if 'Encampment' in str(request_type):
				print url_goal
				#print blockquote
				#thefile.write("%s\n" % url_goal)
				with open("url_list.txt","a") as thefile:
					thefile.write("%s\n" % url_goal)
Example #21
def get_single_item_data(item_url):
    global contract_count

    l_1 = []
    l_2 = []
    result = {}

    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, features="html.parser")
    for item1 in soup.tbody.findAll('p'):
        l_1.append(item1.string)                                                # List of project numbers
    for item2 in soup.tbody('tr', class_='tr-hr-dashed'):
        i_1 = item2.find_all('td')[0].string
        i_2 = item2.find('span').string[3:]
        i_3 = item2.find_all('div')[1].contents
        if (i_2 == "Этап принят") and (len(i_3) == 1):
            l_2.append(i_1)                                                     # List of etap numbers

    for item_name in soup.table("a", {'class':'panel-some-doc preview'}):
        contract_count += 1                                                     # Additional task to count project files

    result[l_1[1]] = l_2
    return result
Example #22
url_base = 'http://mobile311.sfgov.org/'
url_ext = '?external=false&service_id=518d5892601827e3880000c5' # street and sidewalk cleaning
url= 'http://mobile311.sfgov.org/?external=true&service_id=55e8409a45ff461f92000006&status=open'
url= url_base+url_ext+'&status=open'
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),'lxml')


#print soup.table.tbody.tr

print ("------------")
#print soup.table.tbody.tr.span

#reports = soup.table('span',class_="activity-timestamp")

reports = soup.table('span',"activity-timestamp")
#reports = soup.table.find('span',"activity-timestamp").get_text()

for line in reports:
        line=str(line)
        x=line.find("#")+1
        y=x+7
        z=line[x:y]
        print z
        url_goal = url_base+"reports/"+z
        print url_goal
        page2 = urllib2.urlopen(url_goal)
        real_soup = BeautifulSoup(page2.read())
        blockquote = real_soup('blockquote')
        for line in blockquote:
                request_type = line.find_next_sibling('p')
Example #23
from bs4 import BeautifulSoup
import os
import glob
import sys
from xlrd import open_workbook
from xlwt import Workbook
import xlsxwriter
workbook = xlsxwriter.Workbook('IT_2nd_sem_2nd.xlsx')									#NAME OF GENERATED FILE
worksheet = workbook.add_worksheet()

row = 1
for filename in glob.glob('*.html'):	
	soup = BeautifulSoup(open(filename),'html.parser')
	n=0
	c=0
	for b in soup.table():
		if(str(b.get('id'))!="None"):
			n=n+1
			x=str(b.get('id'))
	for b in soup.table():
		if(str(b.get('id'))!="None"):
			c=c+1
			if(c==n-1):
				x=str(b.get('id'))
				id_selector=x[3:5]
				print(id_selector)
	
	rollnumber = str(soup.find(id='lblRollNo').text)
	name = str(soup.find(id='lblFullName').text)
	fathername = str(soup.find(id='lblFatherName').text)
Example #24
def scrape():
    # Scrape the NASA Mars News Site for the latest news title and paragraph
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find('div', class_='features')
    news_title = result.find('div', class_="content_title").a.text
    news_p = result.find('div',
                         class_='rollover_description_inner').text.strip()
    news_url = result.find('div', class_="content_title").a['href']
    news_url = 'https://mars.nasa.gov' + news_url

    # Scrape the JPL's Featured Space Image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    result = soup.find('a', class_='button fancybox')
    featured_image_url = 'https://www.jpl.nasa.gov' + result[
        'data-fancybox-href']

    # Scrape the Mars Weather twitter account
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    result = soup.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')
    mars_weather = result.text

    # Scrape the Mars Facts webpage
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    facts = [td.text for td in soup.table('td')]
    mars_facts = []
    for i in range(0, len(facts), 2):
        mars_facts.append({facts[i]: facts[i + 1]})

    # Scrape the USGS Astrogeology site for high resolution images
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all('a', class_='itemLink product-item')
    hemispher_image_urls = []
    for i in range(0, 8, 2):
        new_url = 'https://astrogeology.usgs.gov' + results[i]['href']
        browser.visit(new_url)
        html2 = browser.html
        soup2 = BeautifulSoup(html2, 'html.parser')
        hemisphere_image = soup2.find_all('a')
        hemisphere_image_url = hemisphere_image[41]['href']
        hemisphere_title = soup2.find('h2').contents[0]
        hemispher_image_urls.append({
            'title': hemisphere_title,
            'img_url': hemisphere_image_url
        })
    # Aggregate all the data above into a single dictionary and return to the caller
    mars_data = {
        'news_title': news_title,
        'news_p': news_p,
        'news_url': news_url,
        'featured_image': featured_image_url,
        'weather': mars_weather,
        'facts': mars_facts,
        'image1': hemispher_image_urls[0],
        'image2': hemispher_image_urls[1],
        'image3': hemispher_image_urls[2],
        'image4': hemispher_image_urls[3]
    }
    return mars_data
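The facts loop pairs consecutive <td> cells into {label: value} dictionaries by stepping through the list two at a time; an equivalent zip-based pairing, shown only as a sketch with sample data:

facts = ['Equatorial Diameter:', '6,792 km', 'Polar Diameter:', '6,752 km']
mars_facts = [{label: value} for label, value in zip(facts[::2], facts[1::2])]
print(mars_facts)  # [{'Equatorial Diameter:': '6,792 km'}, {'Polar Diameter:': '6,752 km'}]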
Example #25
def unit_table_to_dict(
        data: str) -> Dict[str, Dict[str, Union[Dict[str, int], str, int]]]:
    """
    Parse HTML unit table from prismata.gamepedia.com into dict format.

    Parameters
    ----------
    data : str
        HTML table for unit list from prismata.gamepedia.com.

    Returns
    -------
    dict

    Example
    -------
    output:
        {
            "Unit Name":
                {
                    "name": "Unit Name",
                    "costs": {
                        "gold": 1,
                        "energy": 0,
                        "green": 1,
                        "blue": 0,
                        "red": 1,
                        },
                    "stats": {
                        "attack": 1,
                        "health": 1,
                        },
                    "attributes": {
                        "supply": 1,
                        "frontline": True,
                        "fragile": False,
                        "blocker": True,
                        "prompt": False,
                        "stamina": 0,
                        "lifespan": 0,
                        "build_time": 0,
                        "exhaust_turn": 0,
                        "exhaust_ability": 0,
                        },
                    "links": {
                        "path": "/Unit_Name",
                        },
                    "type": 1,
                    "unit_spell": "Unit|Spell",
                },
            ...
        }

    """
    soup = BeautifulSoup(data, "html.parser")
    table = soup.table("tr") if soup.table else []

    return {
        clean(unit[0]): {  # unit name
            "name": clean(unit[0]),
            "costs": {
                "gold": clean(unit[3], int),
                "energy": clean(unit[4], int),
                "green": clean(unit[5], int),
                "blue": clean(unit[6], int),
                "red": clean(unit[7], int),
                },
            "stats": {
                "attack": int(clean(unit[15]) or 0),
                "health": clean(unit[10], int),
                },
            "attributes": {
                "supply": clean(unit[8], int),
                "frontline": clean(unit[11], bool),
                "fragile": clean(unit[12], bool),
                "blocker": clean(unit[13], bool),
                "prompt": clean(unit[14], bool),
                "stamina": clean(unit[16], int),
                "lifespan": clean(unit[19], int),
                "build_time": clean(unit[9], int),
                "exhaust_turn": clean(unit[17], int),
                "exhaust_ability": clean(unit[18], int),
                },
            "links": {
                "path": unit[0].a.get("href"),
                },
            "type": clean(unit[1], int),
            "unit_spell": clean(unit[2]),
            }
        for unit in map(lambda row: row("td"), table)  # type: ignore
        if unit
        # Ignoring typing in map, all uses of the clean function return Any
        # This means that it can't match the return types for this function
        }
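The clean helper is not included with this example; judging from the call sites it takes a <td> tag plus an optional target type, strips the cell text, and casts it. A minimal sketch under that assumption:

from typing import Any, Callable

def clean(cell, cast: Callable[[str], Any] = str) -> Any:
    """Strip a table cell's text and cast it (hypothetical reconstruction)."""
    text = cell.get_text(strip=True)
    if cast is bool:
        return bool(text)      # an empty cell reads as False, any marker as True
    if cast is int:
        return int(text or 0)
    return cast(text)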
Example #26
def scrape():
    browser = init_browser()
    # create mars_data dict that we can insert into mongo
    mars_data = {}



    # visit nasa for news of mars
    browser = Browser('chrome', headless=False)
    url_news = 'https://mars.nasa.gov/news/'
    browser.visit(url_news)

    # create a soup object from the html
    html_news = browser.html
    soup_news = BeautifulSoup(html_news, 'html.parser')

    div1 = soup_news.find('div', class_='content_title')
    news_title = div1.find('a').text
    news_p = soup_news.find('div', class_='article_teaser_body').text
    
    # add them into mars_data dict
    mars_data['news_title'] = news_title
    mars_data['news_p'] = news_p
    
    

    # visit JPL Mars space images to get a big image
    browser = Browser('chrome', headless=False)
    url_img = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_img)
    browser.click_link_by_partial_text('FULL IMAGE')

    # create a soup object from the html
    html_img = browser.html
    soup_img = BeautifulSoup(html_img, 'html.parser')

    home = soup_img.find('article', class_="carousel_item")
    link = home.a['data-fancybox-href']
    featured_image_url = 'https://www.jpl.nasa.gov' + link

    # add it into mars_data dict
    mars_data['featured_image_url'] = featured_image_url

    

    # visit twitter to get Mars Weather
    url_weather = 'https://twitter.com/marswxreport?lang=en'
    html_weather = requests.get(url_weather)
    soup_weather = BeautifulSoup(html_weather.text, 'html.parser')

    tweet = soup_weather.find('div', class_='stream')
    mars_weather = tweet.find(text="Mars Weather").findNext('p').text

    # add it into mars_data dict
    mars_data['mars_weather'] = mars_weather

    
    
    # visit Mars facts and create a table by pandas
    url_facts = 'https://space-facts.com/mars/'
    facts_table = pd.read_html(url_facts)
    df = facts_table[0]
    df.columns = ['Description', 'Value']
    df.set_index(['Description'], inplace = True)
    df.to_html('Mars_df.html')

    # Generate a html table from dataframe
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')

    soup_table = BeautifulSoup(open('Mars_df.html'), 'html.parser')

    # create a dictionaries for all cells to create a table in html
    mars_facts = {}
    mars_list = []
    ths = [x.text.strip(':') for x in soup_table.table('th') if x.text != '']
    column_list = ths[0:2]
    column_list.reverse()
    th = ths[2:]
    td = [y.text for y in soup_table.table('td')]
    mars_facts = dict([(i, j) for i, j in zip(th, td)])
    mars_list.append(mars_facts)

    # add them into mars_data dict
    mars_data['columns'] = column_list
    mars_data['mars_list'] = mars_list
    
    
    
    # get the hemisperes imgs
    url_hemisperes = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hemisperes)

    html_hemisperes = browser.html
    soup_hem = BeautifulSoup(html_hemisperes, 'html.parser')
    jpg_links = soup_hem.find_all('div', class_='description')

    Mars_Hemisperes = []
    for link in jpg_links:
        info = {}
        h3 = link.find('h3').text
        info['title'] = h3
        browser.click_link_by_partial_text(h3)
        html2 = browser.html
        soup2 = BeautifulSoup(html2, 'html.parser')
        url = soup2.find('img', class_='wide-image')['src']
        info['img_url'] = 'https://astrogeology.usgs.gov' + url
        Mars_Hemisperes.append(info)
        browser.click_link_by_partial_text('Back')
    
    # add it into mars_data dict
    mars_data['Mars_Hemisperes'] = Mars_Hemisperes

    return mars_data
Example #27
'''
https://kaijento.github.io/2017/03/30/beautifulboup-removing-tags/
'''

import csv, json, requests, sys
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_sector_composition'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

writer = csv.writer(sys.stdout)
for tr in soup.table('tr')[2:]:
    for tag in tr(['span', 'sup']):
        tag.decompose()
    writer.writerow([td.text for td in tr('td')])
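Each row's <span> and <sup> tags (typically hidden sort keys and footnote markers in Wikipedia tables) are destroyed with decompose() before the cell text is read. A tiny self-contained illustration:

from bs4 import BeautifulSoup

row = BeautifulSoup('<tr><td>India<sup>[1]</sup></td></tr>', 'html.parser').tr
for tag in row(['span', 'sup']):   # find_all() accepts a list of tag names
    tag.decompose()                # remove the tag and its contents in place
print([td.text for td in row('td')])  # ['India']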
Example #28
    def obtainWindSpeed(self, model, cities=False):
        """!
        function to get the api data app.deta.sh

        @param cities true or false value to get cities

        @return object json with the following structure
            example:
                [
                    {
                        "city": "London, United Kingdom", 
                        "wind_speed": "13 kph", 
                        "coordinates": { 
                        "w": "0.1278",
                        "n": "51.5074"
                        }
                    }, ...
                ]
        """
        response = requests.get(URL_API_WIND, verify=True)
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.table('tr')
        data = []
        for row in rows[1:]:
            column = row('td')
            td_elements = [c.text.replace('°', '') for c in column]
            coordinates = td_elements[0].split(',')
            coordinates = [
                "=".join(coordinate.strip().split(" ")[::-1]).lower()
                for coordinate in coordinates
            ]
            wind_speed = td_elements[1]
            dict_coordinates = self.coordinatesToJson(coordinates)

            if cities:
                response_cities = self.getCityName(coordinates)
                city = response_cities['result'] if response_cities.get(
                    'result', None) else None
            else:
                city = None
            object_model = model.objects.filter(city=city,
                                                wind_speed=wind_speed).last()
            if object_model:
                date_now = timezone.now()
                diff = relativedelta(date_now, object_model.date_register)
                if diff.hours >= 1:
                    object_model = model.objects.create(
                        city=city,
                        coordinates=dict_coordinates,
                        wind_speed=wind_speed)
            else:
                object_model = model.objects.create(
                    city=city,
                    coordinates=dict_coordinates,
                    wind_speed=wind_speed)
            comments = WeatherJournal.objects.filter(
                fk_weather=object_model.pk)
            data.append({
                'id':
                object_model.pk,
                'coordinates':
                dict_coordinates,
                'wind_speed':
                wind_speed,
                'city':
                city,
                'comments':
                WeatherJournalSerializer(comments, many=True).data
            })
            dict_coordinates = {}
        return json.dumps(data)