def parse_detail(self, response):
    """Parse one company detail page and yield a normalized item dict.

    Reads Drupal-style ``.field-field-*`` blocks for contact data,
    optionally downloads the company image to ``images/<spider>/``,
    and yields a dict of stripped string fields. Missing fields
    default to ''.
    """
    category = response.css('.meta .category a::text').get() or ''
    name = helpers.fix_title(response.css('h2::text').get() or '')
    slug = helpers.get_slug(name)
    address = response.css('.node p::text').get() or ''
    city = response.css('.meta .tags a::text').get() or ''
    phone = response.css(
        '.field-field-telepon .field-item::text').get() or ''
    fax = response.css('.field-field-fax .field-item::text').get() or ''
    email = response.css(
        '.field-field-email .field-item::text').get() or ''
    website = response.css(
        '.field-field-website .field-item::text').get() or ''
    broker = response.css(
        '.field-field-broker .field-item::text').get() or ''
    # BUG FIX: original chained .replace() straight onto .get(), which
    # raises AttributeError when the NPWP field is missing (get() -> None).
    # Default to '' first, then clean the label and surrounding junk.
    npwp = response.css('.field-field-npwp .field-item::text').get() or ''
    npwp = npwp.replace('NPWP', '').strip('\n :')
    description = ''
    url = response.url or ''
    image_name = ''

    # Drop "websites" that just point back at the site being scraped.
    if self.name in website:
        website = ''

    image_url = response.css('img::attr(src)').get()
    if image_url is not None:
        image_url = image_url.strip()
        ext = image_url.split('.')[-1]
        image_name = slug
        target_dir = 'images/{}/{}.{}'.format(self.name, image_name, ext)
        self.logger.info('downloading image: {} => {}'.format(
            image_url, target_dir))
        r = helpers.download(image_url, target_dir)
        if not r:
            self.logger.info('Failed download {} => {}'.format(
                image_url, target_dir))

    yield {
        'category': category.strip(),
        'name': name.strip(),
        'slug': slug.strip(),
        'address': address.strip(),
        'city': city.strip(),
        'phone': phone.strip(),
        'fax': fax.strip(),
        'email': email.strip(),
        'website': website.strip(),
        'broker': broker.strip(),
        'npwp': npwp.strip(),
        'description': description.strip(),
        'url': url.strip(),
        'image_name': image_name.strip(),
    }
for k, v in data.items(): v = remove_unicode(v) v = v.replace(' ', ' ').replace(' ', ' ').replace(' ', ' ') data[k] = v return data print('Load done data...') done = {} with open(file_reputasi, 'r', encoding='utf8') as f: for row in f.read().strip().split('\n'): row = json.loads(row) # print(helpers.fix_title(row['name'])) # done[helpers.get_slug(helpers.fix_title(row['name']), '', True)] = row['url'] done[helpers.get_slug(helpers.fix_title(row['slug']), '', True)] = row['url'] print('{} done data loaded'.format(len(done))) print('Load perusahaan data...') perusahaan = [] skipped_counter = { 'done': 0, 'empty_name': 0, 'empty_address': 0, 'empty_phone': 0, 'empty_email': 0, 'invalid_phone': 0, 'invalid_email': 0, } with open(file_source, 'r') as f:
'website': 0, } done_slug = [] done_email = [] done_phone = [] done_website = [] clean = [] print("INFO: start cleaning...") for row in data: # print(row[COL_NAME]) row = clean_data(row) category = row[COL_CATEGORY] # sc = category.lower() # if sc not in categories: # categories.append(sc) name = helpers.fix_title(row[COL_NAME]) slug = helpers.get_slug(name) email = row[COL_EMAIL] phone = row[COL_PHONE] website = row[COL_WEBSITE] city = row[COL_CITY] if len(city) == 0: city = row[COL_ADDRESS].strip().split(' ')[-1].strip() #.lower() row[COL_CITY] = city if slug in done_slug: duplicate['slug'] += 1 print('INFO: dp slug => {}'.format(slug)) continue if email in done_email: duplicate['email'] += 1 print('INFO: dp email => {}'.format(email))
def parse_detail(self, response):
    """Parse one business detail page and yield a normalized item dict.

    Decodes a Cloudflare-obfuscated email when present, stitches the
    address together from paired ``h4 > span`` fragments, joins the
    description paragraphs, and (when both email and phone exist)
    downloads the listing image.
    """
    category = response.css('ol.breadcrumb.pull-left > li > a')[-1].css('::text').get() or ''
    name = response.css('h1.business-title span::text').get() or ''
    address = []
    city = response.css('span[itemprop=addressLocality]::text').get() or ''
    phone = response.css('span[itemprop=telephone]::text').get() or ''
    email = ''
    website = response.css('ul.dropdown-menu > li > a[itemprop=url]::attr(href)').get() or ''
    description = []
    url = response.url or ''

    # email: Cloudflare scrambles addresses into a data-cfemail attribute.
    try:
        cfemail = response.css('span.__cf_email__::attr(data-cfemail)').get() or ''
        if len(cfemail) > 0:
            email = helpers.cfDecodeEmail(cfemail)
    except Exception:
        # BUG FIX: was a bare `except:` (also caught SystemExit/KeyboardInterrupt).
        email = ''

    # address: pairs of outer/inner span text fragments.
    # BUG FIX: original indexed address_2[index], which raised IndexError
    # whenever the two selector lists had different lengths; zip() pairs
    # them safely (truncating at the shorter list).
    address_1 = response.css('h4 > span > span::text')
    address_2 = response.css('h4 > span::text')
    for sel_inner, sel_outer in zip(address_1, address_2):
        address.append(sel_inner.get().strip())
        address.append(sel_outer.get().strip())
    address = ' '.join(address)
    address = address.replace(' ,', ',')

    # description: collapse paragraph fragments into one sentence-joined blob.
    for txt in response.css('.col-sm-12 > p p'):
        d = txt.css('::text').get() or ''
        description.append(d.strip())
    description = '. '.join(description)
    description = description.replace('..', '.')
    # Applied twice on purpose: a first pass can leave new '. . ' runs behind.
    description = description.replace('. . ', '. ')
    description = description.replace('. . ', '. ')

    if len(email) == 0:
        self.logger.info('{} : EMPTY EMAIL'.format(url))
    if len(phone) == 0:
        self.logger.info('{} : EMPTY PHONE'.format(url))

    if len(email) > 0 and len(phone) > 0:
        image_url = response.css('.detail-listing-img > img::attr(src)').get()
        # BUG FIX: also require a non-empty URL — image_url[-1] raised
        # IndexError on ''. The trailing-'/' check skips directory-like srcs.
        if image_url is not None and len(image_url) > 0 and image_url[-1] != '/':
            image_url = image_url.strip()
            ext = image_url.split('.')[-1]
            image_name = helpers.get_slug(helpers.fix_title(name))
            # BUG FIX: ext was computed but never used, so images were
            # saved without an extension (siblings use images/<spider>/<slug>.<ext>).
            target_dir = 'images/{}/{}.{}'.format(self.name, image_name, ext)
            self.logger.info('downloading image: {} => {}'.format(image_url, target_dir))
            r = helpers.download(image_url, target_dir)
            if not r:
                # Consistency with sibling spiders: log failed downloads.
                self.logger.info('Failed download {} => {}'.format(image_url, target_dir))

    yield {
        'category': category.strip(),
        'name': name.strip(),
        'address': address.strip(),
        'city': city.strip(),
        'phone': phone.strip(),
        'email': email.strip(),
        'website': website.strip(),
        'description': description.strip(),
        'url': url.strip(),
    }
def parse_detail(self, response):
    """Parse one company detail page (two layout variants) and yield an item.

    Type 1 pages list key/value pairs in ``.comp-body li`` rows; type 2
    pages use a ``table.table.description``. Missing fields default to ''.
    """
    category = ''
    name = ''
    address = ''
    city = ''
    phone = ''
    fax = ''
    email = ''
    website = ''
    description = ''
    url = response.url or ''
    image_url = ''

    # check which page layout we got
    lis = response.css('.comp-body li')
    trs = response.css('table.table.description tr')

    if len(lis) > 0:
        # type 1: "<label> : <value>" list items
        for li in lis:
            # BUG FIX: get() may return None for empty <li>; original
            # crashed on .strip(). Split once instead of twice.
            text = (li.css('::text').get() or '').strip()
            k = text.split(':')[0].strip()
            v = text.split(':')[-1].strip()
            if len(k) == 0:
                continue
            if 'Company Name' in k:
                name = v
            elif 'Address' in k:
                address = v
            elif 'Telephone' in k:
                phone = li.css('a::text').get()
            elif 'Fax' in k:
                fax = v
            elif 'Email' in k:
                email = li.css('a::text').get()

        # description
        description = []
        for p in response.css('.comp-row > p::text'):
            txt = p.get().strip()
            if len(txt) == 0 or 'Description' in txt:
                continue
            description.append(txt)
        description = ' '.join(description)

        # website (drop self-referential links)
        website = response.css('.comp-row > p > a::attr(href)').get() or ''
        if self.name in website:
            website = ''

        # category
        # BUG FIX: [-1] raised IndexError when the selector matched nothing.
        cats = response.css('.title-comp .col-sm-10::text')
        if len(cats) > 0:
            category = cats[-1].get()

        # image_url
        image_url = response.css('.img-container img::attr(src)').get() or ''
    elif len(trs) > 0:
        # type 2: description table rows
        for tr in trs:
            # BUG FIX: [0] raised IndexError on rows without text cells.
            tds = tr.css('td::text')
            if len(tds) == 0:
                continue
            k = tds[0].get()
            v = tds[-1].get()
            if len(k) == 0:
                continue
            # BUG FIX: nested .get() calls below may return None; default to
            # '' so the .strip() calls in the yield cannot crash.
            if 'Nama Perusahaan' in k:
                name = v
            elif 'Alamat' in k:
                address = tr.css('td')[-1].css('p::text').get() or ''
            elif 'Kategori' in k:
                category = v
            elif 'Telepon' in k:
                phone = tr.css('td')[-1].css('a::text').get() or ''
            elif 'Fax' in k:
                fax = tr.css('td')[-1].css('a::text').get() or ''
            elif 'Email' in k:
                email = tr.css('td')[-1].css('a::text').get() or ''

        # description
        description = []
        for p in response.css('.container > p::text'):
            txt = p.get().strip()
            if len(txt) == 0:
                continue
            description.append(txt)
        description = ' '.join(description)

        # website (drop self-referential links)
        website = response.css('a.btn.btn-contactus.btn-go-to::attr(href)').get() or ''
        if self.name in website:
            website = ''

        # image_url
        image_url = response.css('img.center-img::attr(src)').get() or ''

    if email is None or len(email) == 0:
        self.logger.info('{} : EMPTY EMAIL'.format(url))
        email = ''
    if phone is None or len(phone) == 0:
        self.logger.info('{} : EMPTY PHONE'.format(url))
        phone = ''
    if fax is None:
        fax = ''

    name = helpers.fix_title(name)
    slug = helpers.get_slug(name)

    if image_url is not None and len(image_url) > 0:
        image_url = image_url.strip()
        ext = image_url.split('.')[-1]
        image_name = slug
        target_dir = 'images/{}/{}.{}'.format(self.name, image_name, ext)
        self.logger.info('downloading image: {} => {}'.format(image_url, target_dir))
        r = helpers.download(image_url, target_dir)
        if not r:
            self.logger.info('Failed download {} => {}'.format(image_url, target_dir))

    yield {
        'category': category.strip(),
        'name': name.strip(),
        'slug': slug.strip(),
        'address': address.strip(),
        'city': city.strip(),
        'phone': phone.strip(),
        # BUG FIX: fax was scraped but never yielded; sibling spiders emit it.
        'fax': fax.strip(),
        'email': email.strip(),
        'website': website.strip(),
        'description': description.strip(),
        'url': url.strip(),
    }
def parse_detail(self, response):
    """Parse one company detail page built from titled panels and yield an item.

    Category and name come from the breadcrumb; address/phone/website/
    email/description come from ``.panel`` sections matched by their
    Indonesian titles. Missing fields default to ''.
    """
    # BUG FIX: [-2]/[-1] raised IndexError on short breadcrumbs.
    crumbs = response.css('.breadcrumb li')
    category = ''
    name = ''
    if len(crumbs) >= 2:
        category = crumbs[-2].css('::text').get() or ''
    if len(crumbs) >= 1:
        name = helpers.fix_title(crumbs[-1].css('::text').get() or '')
    slug = helpers.get_slug(name)
    address = ''
    city = ''
    phone = ''
    fax = ''
    email = ''
    website = ''
    description = ''
    url = response.url or ''

    for panel in response.css('.panel'):
        # BUG FIX: get() may be None for panels without a title cell;
        # original crashed on .strip().
        panel_title = (panel.css('.col-xs-10.col-sm-11::text').get() or '').strip()
        if 'Alamat' in panel_title:
            address = []
            for addr in panel.css('.panel-body::text'):
                address.append(addr.get().strip())
            address = ', '.join(address)
        elif 'Telepon' in panel_title:
            phones = panel.css('.panel-body::text')
            # BUG FIX: a SelectorList is never None, so the original check
            # always passed and phones[0] could IndexError on empty lists.
            if len(phones) > 0:
                phone = phones[0].get().strip()
                if len(phones) > 1:
                    fax = phones[1].get().strip()
        elif 'Website' in panel_title:
            # BUG FIX: get() may be None; default to '' before .strip().
            website = (panel.css('.panel-body a::attr(href)').get() or '').strip()
            # Drop links that point back at the scraped site itself.
            if self.allowed_domains[0] in website:
                website = ''
        elif 'Email' in panel_title:
            # BUG FIX: None here crashed email.strip() in the yield below.
            email = panel.css('.panel-body a::text').get() or ''
        elif 'Tentang' in panel_title:
            description = []
            for desc in panel.css('.panel-body::text'):
                description.append(desc.get().strip())
            description = ' '.join(description).strip()
            if len(description) == 0:
                # Fallback: take the first substantial paragraph.
                for desc in panel.css('.panel-body p::text'):
                    desc = desc.get().strip()
                    if len(desc) >= 200:
                        description = desc
                        break

    yield {
        'category': category.strip(),
        'name': name.strip(),
        'slug': slug.strip(),
        'address': address.strip(),
        'city': city.strip(),
        'phone': phone.strip(),
        'fax': fax.strip(),
        'email': email.strip(),
        'website': website.strip(),
        'description': description.strip(),
        'url': url.strip(),
    }