def parse1(self, response):
    base_url = get_base_url(response)
    response_content = response.body  # mojibake handling
    # e.g. https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm has 90,000+ rows
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//div[@class="product_list_left_in"]//li', response_content)
    for seg in segs:
        ChemicalName, CASNumber, MolFormula, SearchImg, Synonyms, url = ['' for i in range(6)]
        SearchImg = crawlerTool.getXpath1('//div[@class="leftSearchImg"]/a/img/@src', seg)
        SearchImg = 'https://www.trc-canada.com' + SearchImg
        contents = crawlerTool.getXpath('//div[@class="ContentDesc"]', seg)
        for content in contents:
            content = content.replace('\r', '').replace('\n', '')
            if 'Chemical Name:' in content:
                ChemicalName = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'CAS number:' in content:
                CASNumber = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'Mol. Formula:' in content:
                MolFormula = crawlerTool.getRegex('</label>(.*?)<', content).strip()
            elif 'Synonyms' in content:
                Synonyms = crawlerTool.getRegex('</label>(.*?)<', content).strip()
        # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
        data_obj = Trc_Item()
        data_obj['ChemicalName'] = ChemicalName
        data_obj['CASNumber'] = CASNumber
        data_obj['MolFormula'] = MolFormula
        data_obj['SearchImg'] = SearchImg
        data_obj['Synonyms'] = Synonyms
        data_obj['api_name'] = cat_name
        data_obj['url'] = response.url  # was SearchImg, which just duplicated data_obj['SearchImg']
        yield data_obj
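# A minimal standalone illustration of the '</label>(.*?)<' pattern used above:
# each ContentDesc row has the shape '<label>CAS number:</label>VALUE<br>', so the
# value sits between the closing label tag and the next tag. The sample HTML here
# is made up for the demo.
import re

row = '<div class="ContentDesc"><label>CAS number:</label>50-00-0<br></div>'
m = re.search('</label>(.*?)<', row)
print m.group(1).strip()  # -> 50-00-0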
def parse(self, response):
    base_url = get_base_url(response)
    content = response.body
    # Mojibake handling: keep retrying the decode, excising the offending byte
    # range reported by each UnicodeDecodeError, until the body decodes cleanly.
    for i in range(100):
        try:
            new_content = unicode(content, 'utf8')
            break
        except Exception, e:
            if 'position' in str(e):
                print str(e)
                error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                if '-' in error_str:  # a byte range, e.g. "position 3-5" (was checked against str(e))
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
                else:  # a single offending byte, e.g. "position 3"
                    start_index = int(crawlerTool.getRegex('position (\d+)', str(e)))
                    content = content[:start_index] + content[start_index + 1:]
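# The byte-excision loop above reappears with 'gbk' in parser_sub and in the
# hxchem parser below, so it can be factored out. A minimal sketch (the helper
# name is mine); note that UnicodeDecodeError already carries the offending
# range as e.start/e.end, and that str.decode(encoding, 'ignore') drops all
# undecodable bytes in one call, which is essentially what this loop achieves.
def force_decode(content, encoding='utf8', max_rounds=100):
    for i in range(max_rounds):
        try:
            return unicode(content, encoding)
        except UnicodeDecodeError, e:
            # no need to regex the message text: e.start/e.end are the bad range
            content = content[:e.start] + content[e.end:]
    return content.decode(encoding, 'ignore')  # fallback: drop whatever is left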
def keyword_search(keyword):
    keywords = urllib.quote(keyword)
    url = 'https://www.youtube.com/results?search_query=' + keywords
    page = ct.get(url)
    imgurl0 = ct.getXpath('//div[@id="img-preload"]/img/@src', page)[0]
    vid = ct.getRegex('i.ytimg.com/vi/(.*?)/', imgurl0)
    video_url = 'https://www.youtube.com/watch?v=' + vid
    print video_url
    return video_url, imgurl0
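# keyword_search's contract, for reference: it returns the watch URL for the top
# result plus the preloaded thumbnail it derived the video id from. The query
# string below is made up.
video_url, thumb = keyword_search('Hotel California Eagles')
# video_url -> 'https://www.youtube.com/watch?v=<vid>'
# thumb     -> 'https://i.ytimg.com/vi/<vid>/...jpg?...' (or similar)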
def parse(self, response):
    base_url = get_base_url(response)
    response_content = response.body  # mojibake handling
    # e.g. https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm has 90,000+ rows
    cat_name = response.meta.get('cat_name')
    segs = crawlerTool.getXpath('//table[@id="product-list"]/tbody/tr', response_content)
    for seg in segs:
        name, MolecularFormula, MolecularWeight, image, cas, url = ['' for i in range(6)]
        SearchImg = crawlerTool.getXpath1(
            '//img[@class="dg-picture-zoom acc_img_container acc_zoomer"]/@src', seg)
        contents = crawlerTool.getXpath('//table//tr', seg)
        for content in contents:
            content = content.replace('\r', '').replace('\n', '')
            if 'Name' in content:
                name = crawlerTool.getXpath1('//td[2]', content)
                name = crawlerTool.getRegex('>(.*?)<', name).strip()
            elif 'CAS No' in content:
                cas = crawlerTool.getXpath1('//td[2]', content)
                cas = crawlerTool.getRegex('>(.*?)<', cas).strip()
            elif 'Molecular Formula' in content:
                MolecularFormula = crawlerTool.getXpath1('//td[2]', content)
                MolecularFormula = re.sub('<.*?>', '', MolecularFormula).strip()
            elif 'Molecular Weight' in content:
                MolecularWeight = crawlerTool.getXpath1('//td[2]', content)
                MolecularWeight = crawlerTool.getRegex('>(.*?)<', MolecularWeight).strip()
        # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
        data_obj = acccorporation_Item()
        data_obj['url'] = response.url  # was `name`, which just duplicated data_obj['name']
        data_obj['name'] = name
        data_obj['MolecularFormula'] = MolecularFormula
        data_obj['MolecularWeight'] = MolecularWeight
        data_obj['image'] = SearchImg
        data_obj['cas'] = cas
        yield data_obj
def parse(self, response):
    base_url = get_base_url(response)
    response_content = response.body  # mojibake handling
    url = response.url
    '''
    Item fields:
    url = scrapy.Field()
    IUPACname = scrapy.Field()
    CAS = scrapy.Field()
    Chemspaceid = scrapy.Field()
    Molformula = scrapy.Field()
    Molweight = scrapy.Field()
    '''
    data_obj = ChemspaceItem()
    data_obj['url'] = url
    data_obj['IUPACname'] = crawlerTool.getXpath1('//div[@class="iupac-name"]//text()', response_content)
    data_obj['CAS'] = crawlerTool.getRegex('<dt>CAS</dt>[^<]?<dd>([\d-]+)</dd>', response_content)
    # strip <sub>/</sub> first so the formula matches as a single token
    data_obj['Molformula'] = crawlerTool.getRegex(
        '<dt>Mol formula</dt>[^<]?<dd>([\d\w]+)</dd>',
        response_content.replace('</sub>', '').replace('<sub>', ''))
    data_obj['Molweight'] = crawlerTool.getRegex('<dt>Mol weight</dt>[^<]?<dd>([\d\.]+)</dd>', response_content)
    print data_obj
    yield data_obj
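# Why the <sub> tags are stripped before matching Mol formula: the raw markup is
# shaped like '<dd>C<sub>6</sub>H<sub>6</sub></dd>', where the character class
# [\d\w]+ would stop at the first '<'. The sample markup below is made up.
import re

raw = '<dt>Mol formula</dt><dd>C<sub>6</sub>H<sub>6</sub></dd>'
flat = raw.replace('</sub>', '').replace('<sub>', '')
print re.search('<dt>Mol formula</dt>[^<]?<dd>([\d\w]+)</dd>', flat).group(1)  # -> C6H6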
def start_requests(self):
    sdf_dir = 'compounds'
    sdf_files = os.listdir(sdf_dir)
    for sdf_file in sdf_files:
        with open('compounds/' + sdf_file, 'r') as fin:
            print 'sdf_file', sdf_file
            for line in fin:
                url = crawlerTool.getRegex('(https://chem-space.com/\w+)', line)
                # dedupe via the db; otherwise the process gets killed for lack of memory!
                if url and not self.db_connect.get_by_unique_value(url):
                    yield scrapy.Request(url, callback=self.parse)
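# db_connect.get_by_unique_value is not shown anywhere in these snippets; this is
# a minimal sqlite-backed sketch of the same idea (dedupe on url without holding
# every url in memory). The class and table names here are hypothetical.
import sqlite3

class SeenUrls(object):
    def __init__(self, path='seen_urls.db'):
        self.conn = sqlite3.connect(path)
        self.conn.execute('CREATE TABLE IF NOT EXISTS seen (url TEXT PRIMARY KEY)')

    def get_by_unique_value(self, url):
        cur = self.conn.execute('SELECT 1 FROM seen WHERE url = ?', (url,))
        return cur.fetchone()  # None if the url has not been seen

    def add(self, url):
        self.conn.execute('INSERT OR IGNORE INTO seen (url) VALUES (?)', (url,))
        self.conn.commit()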
def parser_sub(self, response):
    content = response.body
    # Mojibake handling: excise bad byte ranges until the body decodes as gbk
    # (same pattern as the utf8 loop in parse above).
    for i in range(100):
        try:
            new_content = unicode(content, 'gbk')
            break
        except Exception, e:
            if 'position' in str(e):
                error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                if '-' in error_str:  # a byte range, e.g. "position 3-5"
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
                else:  # single offending byte; without this branch int('') raises ValueError
                    start_index = int(crawlerTool.getRegex('position (\d+)', str(e)))
                    content = content[:start_index] + content[start_index + 1:]
def extractor_info(video_url):
    page = ct.get(video_url)
    # The metadata labels vary with the page's UI language, so try several variants.
    artist = ct.getRegex('歌手.*?[tT]ext":"(.*?)"', page)            # "singer"
    if not artist:
        artist = ct.getRegex('艺术家.*?[tT]ext":"(.*?)"', page)      # "artist"
    if not artist:
        artist = ct.getRegex('"Artist".*?[tT]ext":"(.*?)"', page)
    album = ct.getRegex('专辑.*?[tT]ext":"(.*?)"', page)             # "album"
    label = ct.getRegex('由以下相关方许可给.*?[tT]ext":"(.*?)"', page)  # "licensed to ... by"
    if not label:
        label = ct.getRegex('獲以下人士授權.*?[tT]ext":"(.*?)"', page)  # traditional-Chinese variant
    if not label:
        label = ct.getRegex('Licensed to YouTube.*?[tT]ext":"(.*?)"', page)
    song = ct.getRegex('"Song".*?[tT]ext":"(.*?)"', page)
    if not song:
        song = ct.getRegex('"歌曲".*?[tT]ext":"(.*?)"', page)        # "song"
    title = ct.getRegex(',"title":"(.*?)"', page).replace('\\u0026', '&')
    title = re.sub(u"([/\\\\:*?<>|])", "", title)  # strip characters not allowed in filenames
    print title
    return title, artist, album, label, song
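# What the title filter does: drop the characters Windows forbids in filenames so
# that 'img/' + title + '.jpg' is always writable. A standalone check with a
# made-up title:
import re

title = 'AC/DC: Back In Black (Official*Video)?'
print re.sub(u"([/\\\\:*?<>|])", "", title)  # -> 'ACDC Back In Black (OfficialVideo)'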
def parser_detail(self, response):
    content = response.body
    url = response.url
    data_obj = ParkersItem()
    data_obj['title'] = crawlerTool.getXpath('//title/text()', content)[0]
    data_obj['url'] = url
    # extract name and model from the url
    urlsplit = url.split('/')
    if len(urlsplit) > 4:
        data_obj['name'] = urlsplit[3]
        data_obj['model'] = urlsplit[4]
    data_obj['power'] = crawlerTool.getRegex('Power</th><td>(.*?)</td>', content)
    data_obj['TopSpeed'] = crawlerTool.getRegex('Top Speed</th><td>(.*?)</td>', content)
    data_obj['zerotosixty'] = crawlerTool.getRegex('<th>0-60 mph</th><td>(.*?)</td>', content)
    data_obj['Torque'] = crawlerTool.getRegex('<th>Torque</th><td>(.*?)</td>', content)
    data_obj['co2Emissions'] = crawlerTool.getRegex('<th>CO<sub>2</sub> Emissions</th><td>(.*?)</td>', content)
    data_obj['EuroEmissionsStandard'] = crawlerTool.getRegex('<th>Euro Emissions Standard</th><td>(.*?)</td>', content)
    data_obj['Fuelconsumption'] = crawlerTool.getRegex('<tr><th>Fuel consumption</th><td>(.*?)</td>', content)
    data_obj['Length'] = crawlerTool.getRegex('<tr><th>Length</th><td>(.*?)</td>', content)
    data_obj['Width'] = crawlerTool.getRegex('<tr><th>Width</th><td>(.*?)</td>', content)
    data_obj['Height'] = crawlerTool.getRegex('<tr><th>Height</th><td>(.*?)</td>', content)
    data_obj['EngineSize'] = crawlerTool.getRegex('<tr><th>Engine Size</th><td>(.*?)</td>', content)
    data_obj['Cylinders'] = crawlerTool.getRegex('<tr><th>Cylinders</th><td>(.*?)</td>', content)
    data_obj['FuelType'] = crawlerTool.getRegex('<tr><th>Fuel Type</th><td>(.*?)</td>', content)
    data_obj['Transmission'] = crawlerTool.getRegex('<tr><th>Transmission</th><td>(.*?)</td>', content)
    data_obj['Doors'] = crawlerTool.getRegex('<tr><th>Doors</th><td>(.*?)</td>', content)
    data_obj['Seats'] = crawlerTool.getRegex('<tr><th>Seats</th><td>(.*?)</td>', content)
    data_obj['taxcostBasic'] = crawlerTool.getRegex(
        '<tr><th>Monthly company car tax cost \(Basic Rate\)</th><td>(.*?)</td>',
        content).replace('Â£', '£')  # repair mojibake: the UTF-8 pound sign read as latin-1
    yield data_obj
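# The spec fields above are eighteen near-identical getRegex calls; a sketch of a
# table-driven version that keeps a single field->label mapping instead. Labels
# are copied from the parser above; the helper name is mine, and the uniform
# '<th>...' prefix is slightly looser than the '<tr><th>...' used for some fields.
SPEC_FIELDS = [
    ('TopSpeed', 'Top Speed'),
    ('Torque', 'Torque'),
    ('Length', 'Length'),
    ('Width', 'Width'),
    ('Height', 'Height'),
    ('EngineSize', 'Engine Size'),
    ('Cylinders', 'Cylinders'),
    ('FuelType', 'Fuel Type'),
    ('Transmission', 'Transmission'),
    ('Doors', 'Doors'),
    ('Seats', 'Seats'),
]

def fill_spec_fields(data_obj, content):
    for field, label in SPEC_FIELDS:
        data_obj[field] = crawlerTool.getRegex(
            '<th>%s</th><td>(.*?)</td>' % label, content)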
sheet.write(row, 4, u'audio_link')  # note: this column actually receives the watch URL below
sheet.write(row, 5, u'label')
sheet.write(row, 6, u'keyword')
with open('title_list.txt', 'r') as f:
    for line in f:
        row += 1
        keyword = line.strip()
        # keyword = keyword.replace(' ', '+')
        if not keyword:
            continue
        try:
            video_url, imgurl0 = keyword_search(keyword)
            title, artist, album, label, song = extractor_info(video_url)
            img_path = 'img/' + title + '.jpg'
            imgurl0 = ct.getRegex('(http.*?)\?', imgurl0)  # drop the thumbnail's query string
            img_dl(imgurl0, img_path)
        except Exception, e:
            print e
            video_url, imgurl0, title, artist, album, label, song = '', '', '', '', '', '', keyword
        print(video_url, imgurl0, title, artist, album, label)
        sheet.write(row, 0, song)
        sheet.write(row, 1, artist)
        sheet.write(row, 2, title)
        sheet.write(row, 3, title + '.mp3')
        sheet.write(row, 4, video_url)
        sheet.write(row, 5, label)
        sheet.write(row, 6, keyword)
        wbk.save(filename)
# TODO: ads still need to be filtered out
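# The workbook objects (wbk, sheet, row, filename) are created before the fragment
# above; that setup is not shown. A minimal sketch assuming xlwt, which matches
# the sheet.write/wbk.save calls used; the filename and the headers for columns
# 0-3 are hypothetical (columns 4-6 come from the fragment itself).
import xlwt

filename = 'youtube_meta.xls'
wbk = xlwt.Workbook(encoding='utf-8')
sheet = wbk.add_sheet('result')
row = 0
sheet.write(row, 0, u'song')
sheet.write(row, 1, u'artist')
sheet.write(row, 2, u'title')
sheet.write(row, 3, u'audio_file')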
def parse(self, response):  # name and signature assumed: the original fragment opens mid-function
    content = response.body
    # Mojibake handling: the fragment begins inside this gbk byte-excision loop,
    # reconstructed here from the identical pattern in parser_sub above.
    for i in range(100):
        try:
            new_content = unicode(content, 'gbk')
            break
        except Exception, e:
            if 'position' in str(e):
                error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                content = content[:start_index] + content[end_index:]
    response_content = new_content
    print response.url
    url = response.url
    gywm = crawlerTool.getXpath("//td[@class='goscill22']/table[2]//p/text()", response_content)  # "About Us"
    gywm = ''.join(gywm).replace('\n', '').replace('\r', '')
    # response_content = unicode(response_content, 'gbk')  # fails on e.g. http://www.hxchem.net/companydetaildesenborn.html!
    lxwm = crawlerTool.getXpath("//td[@class='goscill22']/table[4]", response_content)  # "Contact Us"
    lxwm = lxwm[0]
    # lxwm = HTMLParser().unescape(lxwm)
    # lxwm = lxwm.encode('utf8')
    data_obj = HxchemItem()
    data_obj['url'] = url
    data_obj['gywm'] = gywm
    data_obj['name'] = crawlerTool.getXpath("//h1/text()", response_content)[0]
    data_obj['lxr'] = crawlerTool.getRegex('联系人:(.*?)<', lxwm)        # contact person
    data_obj['dz'] = crawlerTool.getRegex('地 址:(.*?)<', lxwm)         # address
    data_obj['yb'] = crawlerTool.getRegex('邮 编:(.*?)<', lxwm)         # postcode
    data_obj['dh'] = crawlerTool.getRegex('电 话:(.*?)<', lxwm)         # phone
    data_obj['sj'] = crawlerTool.getRegex('手 机:(.*?)<', lxwm)         # mobile
    data_obj['wz'] = crawlerTool.getRegex('网 址:<.*?>(.*?)<', lxwm)    # website
    data_obj['dzyj'] = crawlerTool.getRegex('电子邮件:<.*?>(.*?)<', lxwm)  # e-mail
    # print lxr,dz,yb,dh,sj
    yield data_obj