def parse_proposal(e):
    """Parse one proposal DOM element into a result dict.

    Returns a dict with keys "total", "currency", "main_airline" and
    "flights", or None when the element cannot be parsed.

    Relies on module globals: proposal_count (progress counter),
    total_propasal_price (compiled regex), parse_flight, Grab, etree, re.
    """
    try:
        global proposal_count
        # Single-argument print() prints identically under Python 2 and 3,
        # unlike the py2-only statement form used before.
        print("start parse proposals #" + str(proposal_count))
        proposal_count += 1
        # Re-serialize the lxml element so Grab's selector helpers can be
        # reused on this fragment alone.
        g = Grab()
        g.response.body = etree.tostring(e)
        flights = []
        departing = True
        # The first '.clearfix' group is the departing leg; the rest are
        # return legs (parse_flight's second arg is the "is return" flag).
        for v in g.css_list('.info>.info>.clearfix'):
            flight = parse_flight(v, not departing)
            flights.extend(flight)
            departing = False
        print(flights)
        return {
            "total": re.search(total_propasal_price, etree.tostring(
                g.css('#paxAdtTd').find('..'))).group(1),
            # NOTE(review): currency is hard-coded — confirm the site always
            # quotes AUD.
            "currency": "AUD",
            "main_airline": None if not flights else flights[0]["__main_airline"],
            "flights": flights
        }
    except Exception:
        # Deliberate best-effort: an unparsable proposal yields None so the
        # caller can skip it.  Consider logging the exception here.
        return None
def parse_flight(e, route_leg):
    # Parse one leg container (a node holding <li> flight entries) into a
    # list of flight dicts.  route_leg is falsy for the departing leg and
    # truthy for the return leg; it selects which request date anchors the
    # times.  Uses module globals: flight_field_pattern,
    # origin_destination_pattern, request, ptime.
    g0 = Grab()
    g0.response.body = etree.tostring(e)
    results = []
    # Rolling day offset: once a flight arrives "+N day", every later flight
    # in the same leg is shifted forward as well.
    offset_days = 0
    for f0 in g0.xpath_list('./li'):
        # Wrap each flight fragment in its own Grab to reuse the selector API.
        g = Grab()
        g.response.body = etree.tostring(f0)
        f = g.css_list('ul>li')
        def ff(index):
            # Text of the index-th field <li> (e.g. departure time, flight no).
            return re.match(flight_field_pattern, etree.tostring(f[index])).group(1)
        h = re.match(origin_destination_pattern, g.css_text('h5'))
        def hh(index):
            # Airport code captured from the "<origin> (XXX) ... (YYY)" header.
            return h.group(index)
        arrival_pattern_plus_day = re.compile('(\d+:\d+ [AP]M) \\+ (\d+) [dD]ay')
        if route_leg:
            base_date = request["return_date"]
        else:
            base_date = request["depart_date"]
        departure = ptime.get_full_date(str(base_date), ff(2))
        departure += datetime.timedelta(days = offset_days)
        arrival_time = ff(3)
        arrival_date = ptime.get_date(base_date)
        arrival_plus_day = re.match(arrival_pattern_plus_day, arrival_time.strip())
        if arrival_plus_day:
            # "8:55 PM + 1 Day" style arrival: bump the rolling offset and
            # keep only the time portion.
            offset_days += 1
            arrival_time = arrival_plus_day.group(1)
        arrival_date += datetime.timedelta(days = offset_days)
        arrival = ptime.get_full_date(arrival_date, arrival_time)
        def sep_number(s):
            # Split e.g. "QF123" into airline code ("QF") and number ("123").
            r = re.match(re.compile('([\w\d][\w\d])(\d+)'), str(s))
            return r.group(1), r.group(2)
        airline, number = sep_number(ff(5))
        results.append({
            "number":number,
            "airline":airline, #ff(4),
            "origin":hh(1),
            "destination":hh(2),
            "departure":ptime.response_date(departure),
            "arrival":ptime.response_date(arrival),
            "duration":None, #ptime.str_timedelta(departure, arrival),
            "route_leg":str(int(route_leg)) ,
            "aircraft":None,
            "__main_airline":airline, #ff(4)
        })
    return results
def grab_team(uri):
    """Scrape team names from a sports.ru match-center page.

    Returns a dict mapping each team name to the value of the first
    attribute of its link element.  An empty dict is returned when the
    selector lookup raises IndexError.
    """
    teams = {}
    browser = Grab()
    browser.go(uri)
    try:
        anchors = browser.css_list('div.pageLayout div.contentLayout div.box div.layout-columns.second-page div.mainPart div.match-center div.tabs-container div ul li.panel.active-panel div.stat.mB6 table.stat-table.table tbody tr td.name-td.alLeft.bordR a')
    except IndexError:
        return teams
    for anchor in anchors:
        teams[anchor.text] = anchor.items()[0][1]
    return teams
def get_list(self):
    """Scrape the hideme.ru proxy list and save it as ``ip:port`` lines.

    Downloads the listing page, collects proxy IPs (td.tdl cells) and the
    obfuscated port images, OCRs each image with ``ocrad`` (after
    converting it to PNM with ImageMagick ``convert``), and writes the
    results to ``root_path + 'lists/proxy.list'``.

    Network and subprocess heavy; returns nothing.  Uses module globals
    ``tmp`` (scratch dir), ``root_path`` and ``unlink``.
    """
    g = Grab()
    g.go('http://hideme.ru/proxy-list/?country=AMAZBEBGCAHRCZDKEEFRGEDEITKZLVLUMDNLPLRORURSSECHTRUAGBUS&type=h&anon=234&code=989570157')
    ips = g.css_list('td.tdl')
    imgs = g.xpath_list('//td/img')
    # The first 5 images on the page are decoration, not port images.
    srclist = ["http://hideme.ru" + i.attrib['src'] for i in imgs[5:]]
    iplist = [ip.text for ip in ips]
    # zip() truncates to the shorter sequence instead of raising IndexError
    # when the page yields unequal numbers of IPs and images.
    d = dict(zip(iplist, srclist))
    for v in d.values():
        # List-argument form (no shell) avoids command injection via
        # scraped URLs.
        subprocess.call(['wget', v, '-q', '-P', tmp])
    dt = {}
    for k, v in d.items():
        # v[24:] strips the "http://hideme.ru/..." URL prefix, leaving the
        # downloaded file name -- TODO confirm the prefix length is stable.
        img_path = tmp + v[24:]
        pnm_path = tmp + v[24:-4] + '.pnm'
        subprocess.call(['convert', img_path, pnm_path])
        unlink(img_path)
        process = subprocess.Popen(
            ['ocrad', '--filter=numbers_only', pnm_path],
            stdout=subprocess.PIPE)
        out, err = process.communicate()
        dt[k] = out.replace('\n\n', '')
        unlink(pnm_path)
    # Save proxy list
    with open(root_path + 'lists/proxy.list', 'w') as f:
        for k, v in dt.items():
            f.write(k + ":" + v + "\n")
def parse_proposal(e):
    """Parse a single proposal DOM element into a result dict, or None.

    Uses module globals: proposal_count (progress counter),
    total_propasal_price (compiled regex) and parse_flight.
    """
    try:
        global proposal_count
        print "start parse proposals #" + str(proposal_count)
        proposal_count += 1
        # Re-serialize the lxml element so Grab's selectors work on it alone.
        g = Grab()
        g.response.body = etree.tostring(e)
        flights = []
        departing = True
        # First '.clearfix' block is the departing leg; later ones are return legs.
        for v in g.css_list('.info>.info>.clearfix'):
            flight = parse_flight(v, not departing)
            flights.extend(flight)
            departing = False
        print flights
        return {
            "total":re.search(total_propasal_price,etree.tostring(g.css('#paxAdtTd').find('..'))).group(1),
            # NOTE(review): currency hard-coded to AUD — confirm upstream.
            "currency":"AUD",
            "main_airline":None if not flights else flights[0]["__main_airline"],
            "flights": flights
        }
    except Exception:
        # Best-effort: any parse failure yields None; the caller filters these.
        return None
"""Demo: scrape the habrahabr.ru front page with Grab.

Prints the first topic link text (via XPath and via CSS), the comment
count, and the netloc of every external http:// link found inside
``.hentry`` blocks.
"""
import logging
from urllib.parse import urlsplit

from grab import Grab

logging.basicConfig(level=logging.DEBUG)

g = Grab()
g.go('http://habrahabr.ru')

print(g.xpath_text('//h2/a[@class="topic"]'))
print(g.css_text('h2 a.topic'))
print('Comments:', g.css_number('.comments .all'))

# External hosts linked from article entries.  startswith('http:')
# deliberately skips https:// links, matching the original behaviour.
print(', '.join(
    urlsplit(x.get('href')).netloc
    for x in g.css_list('.hentry a')
    if 'habrahabr.ru' not in x.get('href') and x.get('href').startswith('http:')
))
"""Scrape thelawdictionary.org letter-by-letter into the Dictonary model."""
# `os` and `string` are used below but were never imported in this span.
import os
import string

import django

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LawBot.settings")
django.setup()

from grab import Grab
from core.models import Dictonary

g = Grab()
g.setup(hammer_mode=False)
for letter in string.ascii_lowercase:
    # Hard upper bound; the loop breaks at the first empty page.
    for page in range(10000):
        g.go('http://thelawdictionary.org/letter/%s/page/%d/' % (letter, page))
        posts = g.css_list('.post')
        if not posts:
            print('end of letter: %s' % letter)
            break
        for post in posts:
            try:
                title = post.cssselect('h2.title a')[0].get('title')
                desc = post.cssselect('article p')[0].text
                Dictonary.objects.create(title=title, description=desc)
                print(title)
            except Exception:
                # Was a bare `except:`; keep the best-effort skip but stop
                # swallowing SystemExit/KeyboardInterrupt.
                print('error')
        print('---------------\nletter %s page %d complete' % (letter, page))
# total = 0
# total_latlan_none = 0
class LXMLExtensionTest(TestCase):
    """Tests for Grab's lxml-backed selector extensions (xpath_*, css_*,
    strip_tags) against a fixed cp1251 HTML fixture served by the fake
    transport."""

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')
        from lxml.html import fromstring
        # Raw lxml tree of the same document, for comparing plain lxml
        # behaviour against Grab's smart-text helpers.
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        # lxml node text_content() method do not put spaces between text
        # content of adjacent XML nodes
        self.assertEqual(
            self.lxml_tree.xpath('//div[@id="bee"]/div')
            [0].text_content().strip(), u'пчела')
        self.assertEqual(
            self.lxml_tree.xpath('//div[@id="fly"]')[0].text_content().strip(),
            u'му\nха')

    def test_lxml_xpath(self):
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*'))
        self.assertEqual(set(['em', 'div', 'strong', 'style', 'script']), names)
        names = set(x.tag for x in self.lxml_tree.xpath(
            '//div[@id="bee"]//*[name() != "script" and name() != "style"]'))
        self.assertEqual(set(['em', 'div', 'strong']), names)

    def test_xpath(self):
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual(
            'num-2', self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_one('//em[@id="baz"]'))
        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        # smart=True inserts spaces between adjacent text nodes and drops
        # script/style content; smart=False keeps it.
        self.assertEqual(u'пче ла',
                         self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }',
                         self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//code'))
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.xpath_text('//*[@id="bee2"]/@id'))

    def test_xpath_number(self):
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li', make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual(
            '1002',
            self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_number('//liza'))
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.css_one('em#baz'))
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_text('code'))
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_number('liza'))
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        # smart=True inserts a space where adjacent tags would otherwise
        # fuse their text content.
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar',
                         self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual(None, g.tree.xpath('//weight')[0].text)
        # But XML DOM builder produces valid result
        #self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)
        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')
        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
<html><body><h1>test</h1></body></html>
"""
        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Queries against degenerate documents must not raise.
        SERVER.RESPONSE['get'] = 'oops'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')
        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')
# NOTE(review): this snippet is truncated in the source — the trailing
# `for name in newList:` / `try:` has no body here; the continuation is
# missing from this chunk and the code below ends mid-statement.
from grab import Grab
from ipdb import set_trace

def clearStr( string ):
    # Remove newline/tab/CR characters from a string.
    return string.replace('\n','').replace('\t','').replace('\r','')

g = Grab()
# Log in to pr-cy.ru, then fetch the audit page for personabook.ru.
g.go('https://id.pr-cy.ru/signup/login/')
g.doc.set_input('login_email','*****@*****.**')
g.doc.set_input('password','biksileev')
g.doc.submit()
g.go('https://a.pr-cy.ru/personabook.ru')
newList = g.css_list('.is')
i = 0
f = open('prcy.html','w')
f.write('''
<html>
<head>
<meta charset="utf-8" />
</head>
<body>
''')
f.write('<table>')
for name in newList:
    try:
def parse_flight(e, route_leg):
    """Parse one leg container (holding <li> flight entries) into a list
    of flight dicts.

    route_leg is falsy for the departing leg and truthy for the return
    leg; it selects which request date anchors the parsed times.

    Uses module globals: flight_field_pattern, origin_destination_pattern,
    request, ptime, Grab, etree.
    """
    # Hoisted out of the per-flight loop: both patterns are constant, but
    # the original recompiled them on every iteration.
    arrival_pattern_plus_day = re.compile('(\d+:\d+ [AP]M) \\+ (\d+) [dD]ay')
    airline_number_pattern = re.compile('([\w\d][\w\d])(\d+)')

    def sep_number(s):
        # Split e.g. "QF123" into airline code ("QF") and number ("123").
        r = re.match(airline_number_pattern, str(s))
        return r.group(1), r.group(2)

    g0 = Grab()
    g0.response.body = etree.tostring(e)
    results = []
    # Rolling day offset: once a flight arrives "+N day", every later
    # flight in the same leg is shifted forward as well.
    offset_days = 0
    for f0 in g0.xpath_list('./li'):
        g = Grab()
        g.response.body = etree.tostring(f0)
        f = g.css_list('ul>li')

        def ff(index):
            # Text of the index-th field <li> (departure time, flight no, ...).
            return re.match(flight_field_pattern,
                            etree.tostring(f[index])).group(1)

        h = re.match(origin_destination_pattern, g.css_text('h5'))

        def hh(index):
            # Airport code captured from the "<origin> (XXX) ... (YYY)" header.
            return h.group(index)

        if route_leg:
            base_date = request["return_date"]
        else:
            base_date = request["depart_date"]
        departure = ptime.get_full_date(str(base_date), ff(2))
        departure += datetime.timedelta(days=offset_days)
        arrival_time = ff(3)
        arrival_date = ptime.get_date(base_date)
        arrival_plus_day = re.match(arrival_pattern_plus_day,
                                    arrival_time.strip())
        if arrival_plus_day:
            # "8:55 PM + 1 Day" style arrival: bump the rolling offset and
            # keep only the time portion.
            offset_days += 1
            arrival_time = arrival_plus_day.group(1)
        arrival_date += datetime.timedelta(days=offset_days)
        arrival = ptime.get_full_date(arrival_date, arrival_time)
        airline, number = sep_number(ff(5))
        results.append({
            "number": number,
            "airline": airline, #ff(4),
            "origin": hh(1),
            "destination": hh(2),
            "departure": ptime.response_date(departure),
            "arrival": ptime.response_date(arrival),
            "duration": None, #ptime.str_timedelta(departure, arrival),
            "route_leg": str(int(route_leg)),
            "aircraft": None,
            "__main_airline": airline, #ff(4)
        })
    return results
def page_results(request, content):
    """Parse a raw results-page HTML body into a list of proposal dicts.

    `request` supplies "depart_date"/"return_date" used to anchor flight
    times; `content` is the raw HTML of the results page.  Uses module
    globals: proposal_count, Grab, etree, ptime.
    """
    g = Grab()
    g.response.body = content
    print("start parsing...")
    # Example: <li><strong>Departure Time</strong> - 8:55 PM</li>
    flight_field_pattern = re.compile(
        '<li><strong>[^<]+</strong> - ([^<]+)</li>')
    # Example: Sydney, Nsw (SYD) to Kuala Lumpur (KUL) May 8, 2012
    origin_destination_pattern = re.compile(
        '[^\(]+ \(([^\)]+)\) [^\(]+ \(([^\)]+)\).*')
    # Example: <td id="paxAdtTd">1</td><td>$1296.87</td>
    total_propasal_price = re.compile(
        '<td id="paxAdtTd">[^<]*</td><td>\\$([^<]+)</td>')
    # Hoisted from parse_flight's inner loop: constant patterns that were
    # recompiled on every iteration.
    arrival_pattern_plus_day = re.compile('(\d+:\d+ [AP]M) \\+ (\d+) [dD]ay')
    airline_number_pattern = re.compile('([\w\d][\w\d])(\d+)')
    # operation_airline = main airline, airline = airline

    def parse_flight(e, route_leg):
        # Parse one leg container into flight dicts; route_leg is truthy
        # for the return leg (selects which request date anchors the times).
        g0 = Grab()
        g0.response.body = etree.tostring(e)
        results = []
        # Rolling "+N day" offset carried across flights within the leg.
        offset_days = 0
        for f0 in g0.xpath_list('./li'):
            g = Grab()
            g.response.body = etree.tostring(f0)
            f = g.css_list('ul>li')

            def ff(index):
                # Text of the index-th field <li>.
                return re.match(flight_field_pattern,
                                etree.tostring(f[index])).group(1)

            h = re.match(origin_destination_pattern, g.css_text('h5'))

            def hh(index):
                # Airport code from the leg header.
                return h.group(index)

            def sep_number(s):
                # "QF123" -> ("QF", "123")
                r = re.match(airline_number_pattern, str(s))
                return r.group(1), r.group(2)

            if route_leg:
                base_date = request["return_date"]
            else:
                base_date = request["depart_date"]
            departure = ptime.get_full_date(str(base_date), ff(2))
            departure += datetime.timedelta(days=offset_days)
            arrival_time = ff(3)
            arrival_date = ptime.get_date(base_date)
            arrival_plus_day = re.match(arrival_pattern_plus_day,
                                        arrival_time.strip())
            if arrival_plus_day:
                # "8:55 PM + 1 Day": bump the offset, keep the time part.
                offset_days += 1
                arrival_time = arrival_plus_day.group(1)
            arrival_date += datetime.timedelta(days=offset_days)
            arrival = ptime.get_full_date(arrival_date, arrival_time)
            airline, number = sep_number(ff(5))
            results.append({
                "number": number,
                "airline": airline, #ff(4),
                "origin": hh(1),
                "destination": hh(2),
                "departure": ptime.response_date(departure),
                "arrival": ptime.response_date(arrival),
                "duration": None, #ptime.str_timedelta(departure, arrival),
                "route_leg": str(int(route_leg)),
                "aircraft": None,
                "__main_airline": airline, #ff(4)
            })
        return results

    def parse_proposal(e):
        # Parse one proposal element; returns None on any failure so the
        # caller can skip unparsable proposals.
        try:
            global proposal_count
            print("start parse proposals #" + str(proposal_count))
            proposal_count += 1
            g = Grab()
            g.response.body = etree.tostring(e)
            flights = []
            departing = True
            # First '.clearfix' block is the departing leg.
            for v in g.css_list('.info>.info>.clearfix'):
                flight = parse_flight(v, not departing)
                flights.extend(flight)
                departing = False
            print(flights)
            return {
                "total": re.search(total_propasal_price, etree.tostring(
                    g.css('#paxAdtTd').find('..'))).group(1),
                "currency": "AUD",
                "main_airline": None if not flights else flights[0]["__main_airline"],
                "flights": flights
            }
        except Exception:
            return None

    # Renamed from `list`, which shadowed the builtin.
    proposals = g.css_list('.result-list>li')
    results = []
    for e in proposals:
        proposal = parse_proposal(e)
        if proposal:
            results.append(proposal)
    return results
def grabPRCY(fileAddr):
    # Build a technical-audit HTML/PDF report for each customer listed in
    # fileAddr (space-separated fields per line: name, domain, email, ...)
    # using data scraped from a.pr-cy.ru, then record the processed line in
    # Finished.txt and mirror progress into the Tk text widget `text1`.
    # NOTE(review): reconstructed from collapsed source — the indentation
    # and the line breaks inside the long literals are a best-effort
    # reconstruction and should be confirmed against the original file.
    from grab import Grab

    def clearStr( string ):
        # Strip newline/tab/CR characters; a None input falls through and
        # returns None implicitly.
        if type(string) != type(None):
            return string.replace('\n','').replace('\t','').replace('\r','')

    g = Grab()
    # Log in once; the session is reused for every audit page below.
    g.go('https://id.pr-cy.ru/signup/login/')
    g.doc.set_input('login_email','*****@*****.**')
    g.doc.set_input('password','biksileev')
    g.doc.submit()
    output = open('Finished.txt', 'w')
    j = 1
    phant = webdriver.PhantomJS()
    for string in fileinput.input(fileAddr):
        customerList = string.split(' ')
        customerList[2] = clearStr(customerList[2])
        # Hit the page with PhantomJS first so pr-cy runs its tests, then
        # wait a minute before scraping the finished report with Grab.
        phant.get('https://a.pr-cy.ru/' + customerList[1])
        time.sleep(60)
        g.go('https://a.pr-cy.ru/' + customerList[1])
        newList = g.css_list('.is')
        print(len(newList))
        i = 0
        f = open('audit/' + customerList[1] + '.html','w')
        f.write('''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<link rel='stylesheet' href="style.css">
</head>
<body>
<div id="head">
<img src="biksileev.jpg"/>
<h1>Технический аудит сайта http://''' + customerList[1] + '''</h1>
<p>Для чёткого понимания текущего технического состояния сайта http://''' + customerList[1] + ''' был проведён полный технический аудит, результаты которого представлены ниже в виде таблицы.</p></div>''')
        f.write('<div>')
        f.write('<table>')
        f.write('<thead><tr><td colspan="2">Технический аудит</td></tr></thead>')
        f.write('<tbody>')
        f.write('<tr><td>Критерий</td><td>Текущее состояние</td></tr>')
        for name in newList:
            if True: #not('Обратные ссылки' in name.cssselect('.info-test')[0].text) or not('Аналитика' in name.cssselect('.info-test')[0].text):
                if len(name.cssselect('.info-test')) > 0:
                    print(name.cssselect('.info-test')[0].text)
                    # NOTE(review): ('a' or 'b') evaluates to 'a', so only
                    # 'Описание страницы' is actually tested here — likely a bug.
                    if (('Описание страницы' or 'Скриншот сайта на смартфоне') in name.cssselect('.info-test')[0].text):
                        f.write('</table></div><div class="pageBreak"><table>')
                        f.write('<tr ><td class="left">')
                    else:
                        f.write('<tr><td class="left">')
                    f.write(name.cssselect('.info-test')[0].text)
                    f.write('</td>')
                f.write(' ')
                if len(name.cssselect('.content-test')) > 0:
                    # Plain-text result cell.
                    if (len(clearStr(name.cssselect('.content-test')[0].text)) > 0):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        if (len(name.cssselect('.content-test')[0].cssselect('a')) > 0):
                            f.write(clearStr(name.cssselect('.content-test')[0].text) + clearStr(name.cssselect('.content-test')[0].cssselect('a')[0].text))
                        else:
                            f.write(clearStr(name.cssselect('.content-test')[0].text))
                        f.write('</td>')
                    # Smartphone screenshot test.
                    elif (len(name.cssselect('.content-test')[0].cssselect('.iphone .iphone-screen img')) > 0):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        f.write('<img src="http://' + name.cssselect('.content-test')[0].cssselect('.iphone .iphone-screen img')[0].get('src')[2:] + '">')
                        f.write('</td>')
                    # Social-network presence tests.
                    elif(('Facebook' in name.cssselect('.info-test')[0].text) or ('ВКонтакте' in name.cssselect('.info-test')[0].text) or ('Google+' in name.cssselect('.info-test')[0].text) or ('Twitter' in name.cssselect('.info-test')[0].text)):
                        if(name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                            f.write('Ссылка на страницу найдена.')
                            f.write('</td>')
                        elif(name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                            f.write('Ссылка на страницу не найдена.')
                            f.write('</td>')
                    # Result rendered as a bare link.
                    elif ((len(name.cssselect('.content-test')[0].cssselect('a')) > 0)):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        f.write(clearStr(name.cssselect('.content-test')[0].cssselect('a')[0].text))
                        f.write('</td>')
                    # Result rendered as paragraphs.
                    elif (len(name.cssselect('.content-test')[0].cssselect('p')) > 0):
                        newList2 = name.cssselect('.content-test')[0].cssselect('p')
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        for paragraph in newList2:
                            f.write(clearStr(paragraph.text))
                            f.write('<br>')
                        f.write('</td>')
                    # Nested progress-info widget (checked before the outer one).
                    elif (len(name.cssselect('.content-test')[0].cssselect('.progress-info .progress-info')) > 0):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        f.write(clearStr(name.cssselect('.content-test')[0].cssselect('.progress-info .progress-info')[0].text))
                        f.write('</td>')
                    elif (len(name.cssselect('.content-test')[0].cssselect('.progress-info')) > 0):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        f.write(clearStr(name.cssselect('.content-test')[0].cssselect('.progress-info')[0].text))
                        f.write('</td>')
                    # Analytics counters rendered as spans.
                    elif (len(name.cssselect('.content-test')[0].cssselect('span')) > 0) or ('Системы статистики' in name.cssselect('.info-test')[0].text):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        newList2 = name.cssselect('.content-test')[0].cssselect('span')
                        for analytics in newList2:
                            f.write(clearStr(analytics.text))
                            f.write('<br>')
                        f.write('</td>')
                # Rows without a .content-test cell.
                elif (len(name.cssselect('.info-test')) > 0):
                    if('Местоположение сервера' in name.cssselect('.info-test')[0].text):
                        if (name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                        elif (name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right unsuccess">')
                        else:
                            f.write('<td class="right">')
                        f.write(name.cssselect('.content-test img')[0].get('alt').split(' ')[2])
                        f.write('</td>')
                    elif('Favicon' in name.cssselect('.info-test')[0].text):
                        # NOTE(review): both branches write the identical
                        # success markup/text — the 'fail' branch looks like a
                        # copy-paste slip.
                        if(name.cssselect('.check-test')[0].get('test-status') == 'success'):
                            f.write('<td class="right success">')
                            f.write('Отлично, у сайта есть Favicon.')
                            f.write('</td>')
                        elif(name.cssselect('.check-test')[0].get('test-status') == 'fail'):
                            f.write('<td class="right success">')
                            f.write('Отлично, у сайта есть Favicon.')
                            f.write('</td>')
                i += 1
                '''f.write('<td>')
                newList3 = name.cssselect('.description p')
                for paragraph in newList3:
                    f.write(paragraph.text)'''
        f.write('</tbody>')
        f.write('</table>')
        f.write('''<p> Резолюция Сайт частично оптимизирован.</p>
</body>
</html>
''')
        f.close()
        # Render the HTML report to PDF (HTML object presumably from
        # weasyprint -- TODO confirm).
        file = HTML(filename="audit/" + customerList[1] + ".html")
        file.render().write_pdf(target="audit/" + customerList[1] + ".pdf")
        #file.render('file://' + os.getcwd() + '/audit/' + customerList[1] + '.html', 'audit/' + customerList[1] + '.pdf')
        subject = customerList[0] + ' - подготовили аудит вашего сайта: ' + customerList[1]
        message = customerList[0] + """, добрый день!
Причина нашего обращения к Вам не случайна. Специалистами студии Дмитрия Биксилеева в течение марта месяца проводился выборочный аудит сайтов компаний работающих в сфере услуг для бизнеса. В том числе был проведен краткий аудит Вашего сайта %s
Нашими SEO-специалистами выявлены достаточно серьезные ошибки на сайте, мешающие его продвижению в поисковых системах и снижающие удобство пользования вашим сайтом для ваших потенциальных клиентов (см. приложение «Экспресс аудит сайта»). Как правило, данные ошибки не заметны на первый взгляд, но об их наличии убедительно свидетельствует низкий КПД сайта.
Наверное, и Вы сами, как ответственный и экономный хозяин, периодически задаетесь вопросом:
Почему сайт, в который вложено столько интеллектуальных и финансовых ресурсов не оправдывает свое существование?
Почему клиенты заходят на сайт, но не совершают покупок?
Почему Ваши конкуренты уводят клиентов?
Мы дадим ответы на все интересующие Вас вопросы и с удовольствием поделимся самыми свежими и самыми необходимыми в XXI веке знаниями по интернет-маркетингу. В случае Вашей заинтересованности, сделаем полный базовый, технический и юзабилити аудит сайта, предложим реальные сроки и способы устранения недостатков и выведем Ваш сайт на лидирующие позиции в поисковиках по самым высоко конверсионным запросам.
Мы не предлагаем Вам услуги с непредсказуемым или неубедительным результатом. Мы предлагаем взрывной рост Вашему Интернет-бизнесу! Помогая Вам в бизнесе, мы становимся своеобразным хуком в интернет-продажах, Вашим директором по маркетингу, полностью выстраивающим маркетинг и систему продаж.
С уважением к Вам и Вашему бизнесу,
Бубновский Михаил
Директор по развитию компании
Студия Дмитрия Биксилеева
----------------------------------------------------------
Тел.: +7(343)298-03-54
Сот. Тел.: +7 (922)1554515
E-mail: [email protected]
skype: ottepel_1
www.biksileev.ru""" % customerList[1]
        #sendMail(customerList[2], subject, message, 'audit/' + customerList[1] + '.pdf')
        customerList.append('Отправлено')
        output.write(' '.join(customerList))
        output.write('\n')
        # Mirror progress into the Tkinter text widget `text1`.
        text1.delete('1.0', str(len(' '.join(customerList) + '\n') + 1) + '.0')
        text1.insert(str(j) + '.0', ' '.join(customerList) + '\n')
        text1.update()
    output.close()
    phant.quit()
class LXMLExtensionTest(TestCase):
    """Exercises Grab's lxml selector extensions (xpath_*, css_*,
    strip_tags) against a fixed cp1251-encoded HTML fixture."""

    def setUp(self):
        SERVER.reset()
        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')
        from lxml.html import fromstring
        # Plain lxml tree of the same document, used to contrast raw lxml
        # behaviour with Grab's helpers.
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        # lxml node text_content() method do not put spaces between text
        # content of adjacent XML nodes
        self.assertEqual(self.lxml_tree.xpath('//div[@id="bee"]/div')[0].text_content().strip(), u'пчела')
        self.assertEqual(self.lxml_tree.xpath('//div[@id="fly"]')[0].text_content().strip(), u'му\nха')

    def test_lxml_xpath(self):
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*'))
        self.assertEqual(set(['em', 'div', 'strong', 'style', 'script']), names)
        names = set(x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*[name() != "script" and name() != "style"]'))
        self.assertEqual(set(['em', 'div', 'strong']), names)

    def test_xpath(self):
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual('num-2', self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_one('//em[@id="baz"]'))
        self.assertEqual(None, self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        # smart=True separates adjacent text nodes with spaces and skips
        # script/style content; smart=False keeps it verbatim.
        self.assertEqual(u'пче ла', self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }', self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//code'))
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_text('//*[@id="bee2"]/@id'))

    def test_xpath_number(self):
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li', make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual('1002', self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        self.assertRaises(DataNotFound, lambda: self.g.xpath_number('//liza'))
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        self.assertEqual(['num-1', 'num-2'], [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, lambda: self.g.css_one('em#baz'))
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_text('code'))
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, lambda: self.g.css_number('liza'))
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        self.assertEqual(['num-1', 'num-2'], [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        # smart=True inserts a space where adjacent tags would otherwise
        # fuse their text content.
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual(None, g.tree.xpath('//weight')[0].text)
        # But XML DOM builder produces valid result
        #self.assertEqual(None, g.xpath_one('//weight').text)
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)
        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')
        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
<html><body><h1>test</h1></body></html>
"""
        g = Grab()
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        # Selector queries against degenerate documents must not raise.
        SERVER.RESPONSE['get'] = 'oops'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')
        SERVER.RESPONSE['get'] = '<frameset></frameset>'
        g = Grab()
        g.go(SERVER.BASE_URL)
        g.xpath_exists('//anytag')
def id_for_answer(answer): """возвращает числовой идентификатор варианта ответа""" body = G.response.body ai = body.find(answer) fr = body[ai - 50: ai] id = fr[fr.find('PDI_answer') + 10:-2] return id def vote(c, l, j, g, o, p, k, m, h, f): """голосуем...""" d = id_for_answer(POSITION) url = 'http://polldaddy.com/vote.php?va={}&pt={}&r={}&p={}&a={}&o=&t={}&token={}'.format(o, g, j, c, d, p, f) G.go(url) try: print G.css_text('.poll-msg'), except: print 'No msg', print G.css_list('.votes')[POSITION_POS - 1].text #запускаем голосовалку for i in range(0, VOTES): G.clear_cookies() G.go('http://polldaddy.com/poll/6061575/') vote_call = G.css_list('.button-lrg')[0].attrib['onclick'] vote_call_args = vote_call[vote_call.find('(') + 1:vote_call.find(')')] vote_call_args = vote_call_args.strip().replace("'", '').split(',') vote(*vote_call_args) sleep(2)
def page_results(request, content):
    # Parse a raw results-page HTML body into a list of proposal dicts.
    # `request` supplies "depart_date"/"return_date" used to anchor flight
    # times; uses module global proposal_count via the nested parser.
    g = Grab()
    g.response.body = content
    print "start parsing..."
    # Example: <li><strong>Departure Time</strong> - 8:55 PM</li>
    flight_field_pattern = re.compile('<li><strong>[^<]+</strong> - ([^<]+)</li>')
    # Example: Sydney, Nsw (SYD) to Kuala Lumpur (KUL) May 8, 2012
    origin_destination_pattern = re.compile('[^\(]+ \(([^\)]+)\) [^\(]+ \(([^\)]+)\).*')
    # Example: <td id="paxAdtTd">1</td><td>$1296.87</td>
    total_propasal_price = re.compile('<td id="paxAdtTd">[^<]*</td><td>\\$([^<]+)</td>')
    # operation_airline = main airline, airline = airline
    def parse_flight(e, route_leg):
        # Parse one leg container into flight dicts; route_leg is truthy for
        # the return leg (selects which request date anchors the times).
        g0 = Grab()
        g0.response.body = etree.tostring(e)
        results = []
        # Rolling "+N day" offset carried across flights within the leg.
        offset_days = 0
        for f0 in g0.xpath_list('./li'):
            g = Grab()
            g.response.body = etree.tostring(f0)
            f = g.css_list('ul>li')
            def ff(index):
                # Text of the index-th field <li>.
                return re.match(flight_field_pattern, etree.tostring(f[index])).group(1)
            h = re.match(origin_destination_pattern, g.css_text('h5'))
            def hh(index):
                # Airport code from the leg header.
                return h.group(index)
            arrival_pattern_plus_day = re.compile('(\d+:\d+ [AP]M) \\+ (\d+) [dD]ay')
            if route_leg:
                base_date = request["return_date"]
            else:
                base_date = request["depart_date"]
            departure = ptime.get_full_date(str(base_date), ff(2))
            departure += datetime.timedelta(days = offset_days)
            arrival_time = ff(3)
            arrival_date = ptime.get_date(base_date)
            arrival_plus_day = re.match(arrival_pattern_plus_day, arrival_time.strip())
            if arrival_plus_day:
                # "8:55 PM + 1 Day": bump the offset, keep the time part.
                offset_days += 1
                arrival_time = arrival_plus_day.group(1)
            arrival_date += datetime.timedelta(days = offset_days)
            arrival = ptime.get_full_date(arrival_date, arrival_time)
            def sep_number(s):
                # "QF123" -> ("QF", "123")
                r = re.match(re.compile('([\w\d][\w\d])(\d+)'), str(s))
                return r.group(1), r.group(2)
            airline, number = sep_number(ff(5))
            results.append({
                "number":number,
                "airline":airline, #ff(4),
                "origin":hh(1),
                "destination":hh(2),
                "departure":ptime.response_date(departure),
                "arrival":ptime.response_date(arrival),
                "duration":None, #ptime.str_timedelta(departure, arrival),
                "route_leg":str(int(route_leg)) ,
                "aircraft":None,
                "__main_airline":airline, #ff(4)
            })
        return results
    def parse_proposal(e):
        # Parse one proposal element; returns None on any failure so the
        # caller can skip unparsable proposals.
        try:
            global proposal_count
            print "start parse proposals #" + str(proposal_count)
            proposal_count += 1
            g = Grab()
            g.response.body = etree.tostring(e)
            flights = []
            departing = True
            # First '.clearfix' block is the departing leg.
            for v in g.css_list('.info>.info>.clearfix'):
                flight = parse_flight(v, not departing)
                flights.extend(flight)
                departing = False
            print flights
            return {
                "total":re.search(total_propasal_price,etree.tostring(g.css('#paxAdtTd').find('..'))).group(1),
                "currency":"AUD",
                "main_airline":None if not flights else flights[0]["__main_airline"],
                "flights": flights
            }
        except Exception:
            return None
    # NOTE(review): `list` shadows the builtin; harmless here but worth renaming.
    list = g.css_list('.result-list>li')
    results = []
    for e in list:
        proposal = parse_proposal(e)
        if proposal:
            results.append(proposal)
    return results