def parse(self, item, *args, **kwargs):
    try:
        tm = textmine.textmine()
        soup = BeautifulSoup(item['body'], 'html.parser')
        text = soup.extract().get_text()

        # Strip configured exclude keywords before text mining.
        if self.conf.has_option(self.section, 'exclude_keywords'):
            exclude_keywords = ast.literal_eval(
                self.conf.get(self.section, 'exclude_keywords'))
            for ex_word in exclude_keywords:
                text = text.replace(ex_word, '')

        tm_result = tm.get(text)

        # item.fields["pdate"] = CommonField()
        # item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
        # if self.start_pdate is None:
        #     self.start_pdate = item["pdate"]
        # item['body'] = str(item['body']).replace('\n', ' ').strip()
        # item.fields["text"] = CommonField()
        # item["text"] = text.replace('\n', ' ').strip()

        item.fields["top_sentence"] = CommonField()
        item.fields["top_word"] = CommonField()
        item.fields["sentences"] = CommonField()
        item.fields["words"] = CommonField()

        if len(tm_result) > 0 and len(tm_result[0]) > 0:
            item["top_sentence"] = str(tm_result[0][0][2]).replace('\n', ' ').strip()
        if len(tm_result) > 0 and len(tm_result[1]) > 0:
            item["top_word"] = str(tm_result[1][0][0]).replace('\n', ' ').strip()
        if len(tm_result) > 0:
            item["sentences"] = str(tm_result[0]).replace('\n', ' ').strip()
        if len(tm_result) > 1:
            item["words"] = str(tm_result[1]).replace('\n', ' ').strip()

        # self.exporter.fields_to_export = ['uuid', 'domain', 'url', 'keyword', 'top_sentence', 'top_word', 'sentences', 'words', 'text', 'body', 'date', 'section', 'pdate']
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'keyword', 'top_sentence', 'top_word',
            'sentences', 'words', 'date', 'section'
        ]
        yield item
    except Exception as ex:
        print(ex)
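# Inferred shape of textmine.textmine().get(text) as indexed above and in the
# word-level parser below. This is an assumption drawn from the indexing, not a
# documented contract of the textmine module:
#
#   tm_result[0]  -> ranked sentences; tm_result[0][0][2] is the top sentence text
#   tm_result[1]  -> ranked (word, score) pairs; tm_result[1][0][0] is the top word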
def parse(self, item, *args, **kwargs):
    self.exporter.fields_to_export = [
        'uuid', 'domain', 'url', 'word', 'word_point', 'date', 'section', 'pdate'
    ]
    try:
        tm = textmine.textmine()
        soup = BeautifulSoup(item['body'], 'html.parser')
        text = soup.extract().get_text()

        # Strip configured exclude keywords before text mining.
        if self.conf.has_option(self.section, 'exclude_keywords'):
            exclude_keywords = ast.literal_eval(
                self.conf.get(self.section, 'exclude_keywords'))
            for ex_word in exclude_keywords:
                text = text.replace(ex_word, '')

        tm_result = tm.get(text)

        # Emit one item per extracted word with its score. The original code
        # re-created the CommonItem before each field registration; a single
        # instance is created here and all fields are registered on it.
        if len(tm_result) > 0 and len(tm_result[1]) > 0:
            for word in tm_result[1]:
                new_item = CommonItem()
                new_item.fields["uuid"] = CommonField()
                new_item.fields["domain"] = CommonField()
                new_item.fields["url"] = CommonField()
                new_item.fields["word"] = CommonField()
                new_item.fields["word_point"] = CommonField()
                new_item.fields["date"] = CommonField()
                new_item.fields["section"] = CommonField()
                new_item.fields["pdate"] = CommonField()

                new_item["encoding"] = item["encoding"]
                new_item["uuid"] = item["uuid"]
                new_item["domain"] = item["domain"]
                new_item["url"] = item["url"]
                new_item["word"] = word[0]
                new_item["word_point"] = str(word[1])
                new_item["date"] = item["date"]
                new_item["section"] = item["section"]
                new_item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')

                if self.start_pdate is None:
                    self.start_pdate = new_item["pdate"]
                yield new_item
    except Exception as ex:
        print(ex)
def process_item(self, item, spider):
    item.fields['fields_info'] = CommonField()
    item.fields['uuid'] = CommonField()
    item.fields['spider_name'] = CommonField()

    # Record the positional index of every field so the consumer can rebuild
    # the item after it is dequeued.
    fields_info = {}
    for idx, val in enumerate(item.fields):
        fields_info.setdefault(str(idx), val)
    item['fields_info'] = fields_info

    item['uuid'] = str(uuid.uuid1())
    item['spider_name'] = str(spider.name)
    self.exporter.fields_to_export = item.fields.keys()

    try:
        self.exporter.export_item(item)
    except Exception as ex:
        logger.error("QueueWriterPipeline Exception : %s", str(ex))
    return item
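# Illustrative sketch of the queued message layout, assuming the exporter packs the
# item's field values in declaration order with the fields_info map as the last
# element; that assumption is inferred from how the parser's run() loop below reads
# u_msg[-1] and u_msg[int(k)]. Field names and values here are hypothetical.
#
#   u_msg = ['20240101120000', 'http://example.com', 'example.com',
#            {'0': 'date', '1': 'url', '2': 'domain', '3': 'fields_info'}]
#
#   item = CommonItem()
#   for k, v in u_msg[-1].items():
#       if v != 'fields_info':
#           item.fields[v] = CommonField()
#           item[v] = u_msg[int(k)]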
def content_parse(self, response, keyword):
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item["encoding"] = response.encoding
        item.fields["section"] = CommonField()
        item["section"] = self.section
        item.fields["keyword"] = CommonField()
        item["keyword"] = keyword
        yield item
    except Exception as ex:
        self.handler.management_info['current_exception'] = str(ex)
        self.handler.management_info['spider_err_count'] += 1
def parse(self, item, *args, **kwargs):
    try:
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate'
        ]
        self.exporter.insert_query = (
            "INSERT INTO TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) "
            "VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');"
        )

        newbody = {}
        soup = BeautifulSoup(item['body'], features="lxml")
        category = soup.select_one(
            'h1.a-size-large.a-spacing-medium.zg-margin-left-15.a-text-bold'
        ).extract().get_text()
        cont_items = soup.select('span.aok-inline-block.zg-item')

        # Emit one item per ranked product entry.
        for i in cont_items:
            newitem = CommonItem()
            newitem.fields['body'] = CommonField()
            newitem.fields['date'] = CommonField()
            newitem.fields['domain'] = CommonField()
            newitem.fields['spider_name'] = CommonField()
            newitem.fields['url'] = CommonField()
            newitem.fields['uuid'] = CommonField()
            newitem.fields['section'] = CommonField()
            newitem.fields["pdate"] = CommonField()

            title = BeautifulSoup(str(i), features="lxml").select_one('div.p13n-sc-truncate').extract().get_text()
            price = BeautifulSoup(str(i), features="lxml").select_one('span').extract().get_text()
            img = BeautifulSoup(str(i), features="lxml").select_one('img').attrs['src']

            newbody['category'] = str(category)
            newbody['title'] = str(title)
            newbody['price'] = str(price)
            newbody['image'] = str(img)

            newitem['body'] = (re.escape(str(newbody)).replace("'", " ").replace(",", " ")
                               .replace('"', ' ').replace('{', ' ').replace('}', ' '))
            newitem['date'] = item['date']
            newitem['domain'] = item['domain']
            newitem['spider_name'] = item['spider_name']
            newitem['url'] = item['url']
            newitem['uuid'] = str(uuid.uuid1())
            newitem['section'] = item['section']
            newitem['pdate'] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
            yield newitem
    except Exception as ex:
        print(ex)
def parse(self, item, *args, **kwargs): try: item.fields["pdate"] = CommonField() item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00') item['body'] = re.escape(item['body']).replace("'", "''").replace( ",", " ").replace('\n', ' ') self.exporter.fields_to_export = [ 'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate' ] # self.exporter.insert_query = "INSERT INTO TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');" self.exporter.insert_query = "insert into tb_crawling (uuid,domain,url,body,date,section,pdate) values ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');" yield item except Exception as ex: print(ex)
def run(self):
    if not self.exporter:
        raise Exception('parser needs an exporter to be defined')
    import msgpack

    self.isruning = True
    idle_time = 0
    while self.isruning:
        self.handler.management_info['parser_opened'] = self.parser_opened
        if self.handler.get_queue_cnt() > 0:
            idle_time = 0
            if not self.parser_opened:
                self.open_parser()
            data = self.handler.dequeue()
            if data != b'' and data is not None:
                # Rebuild the CommonItem from the packed field values using the
                # index-to-name map stored as the last element of the message.
                u_msg = msgpack.unpackb(data, raw=False)
                item = CommonItem()
                for k, v in u_msg[-1].items():
                    if v != 'fields_info':
                        item.fields[v] = CommonField()
                        item[v] = u_msg[int(k)]
                if self.exporter:
                    try:
                        parse_generator = self.parse(item)
                        if parse_generator:
                            for p in parse_generator:
                                self.exporter.export_item(p)
                                self.handler.management_info['export_count'] += 1
                    except Exception as ex:
                        self.handler.management_info['current_exception'] = str(ex)
                        self.handler.management_info['export_err_count'] += 1
        else:
            idle_time += 1
            if idle_time > 60 and self.parser_opened:
                self.close_parser()
        time.sleep(1)
def parse_content(self, response, section, url):
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        # Prefer the main content div when present, otherwise fall back to the raw body.
        item["body"] = response.css('div#webContents').getall()[0] \
            if len(response.css('div#webContents').getall()) > 0 else response.body
        item.fields["section"] = CommonField()
        item["section"] = section
        yield item
    except Exception:
        pass
def parse(self, response, recursive, section):
    ext = tldextract.extract(urlparse(response.url).netloc)
    domain = ext.registered_domain
    ext = DomainPatternLinkExtractor(domain, canonicalize=True, unique=True)

    urls = []
    if recursive:
        try:
            # Only follow links from non-"application/*" responses (skip binaries).
            if response.headers['Content-Type'] \
                    and response.headers['Content-Type'].decode("utf-8").lower().find("application") == -1:
                urls = [link.url for link in ext.extract_links(response)]
            else:
                return
        except Exception:
            pass

    for url in urls:
        yield response.follow(url, self.parse, cb_kwargs={
            'recursive': recursive,
            'section': section
        })

    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = section
        yield item
    except Exception:
        pass
def parse(self, response):
    ext = tldextract.extract(urlparse(response.url).netloc)
    domain = ext.registered_domain
    ext = DomainPatternLinkExtractor(domain, canonicalize=True, unique=True)
    urls = []
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        # `section` was undefined in this callback; self.section is assumed here,
        # following the pattern of the other spiders in this project.
        item["section"] = self.section
        yield item
    except Exception:
        pass
def content_parse_50(self, response):
    try:
        # Follow to the next page (pg=2), handled by content_parse_100.
        params = {}
        params['pg'] = 2
        query_string = urllib.parse.urlencode(params)
        yield response.follow(url=response.url + "?" + query_string,
                              callback=self.content_parse_100)

        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = self.section
        yield item
    except Exception as ex:
        self.handler.management_info['current_exception'] = str(ex)
        self.handler.management_info['spider_err_count'] += 1
class kitaItem(CommonItem):
    section = CommonField()
def parse(self, item, *args, **kwargs): try: item.fields["pdate"] = CommonField() item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00') self.data_reg_dt = datetime.datetime.now().strftime('%Y%m%d%H%M00') self.exporter.fields_to_export = [ 'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate' ] self.exporter.insert_query = "INSERT INTO TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');" if self.gdp_url == item['url']: self.gdp_res = item['body'] elif self.pop_url == item['url']: self.pop_res = item['body'] if self.pop_res and self.gdp_res: pop_dict = {} for dicts in json.loads(self.pop_res)[1]: NAT_CD = dicts['country']['id'].strip() NAT_NAME = dicts['country']['value'].strip() ISO_WD3_NAT_CD = dicts['countryiso3code'].strip() BASE_YR = dicts['date'].strip() POPLTN_VAL = dicts['value'] pop_dict.setdefault( '|'.join(map(str, [NAT_CD, NAT_NAME, ISO_WD3_NAT_CD])), {}).setdefault(BASE_YR, POPLTN_VAL) # print('%s\n' % '|^|'.join(map(str, self.header_list))) for dicts in json.loads(self.gdp_res)[1]: NAT_CD = dicts['country']['id'].strip() NAT_NAME = dicts['country']['value'].strip() ISO_WD3_NAT_CD = dicts['countryiso3code'].strip() BASE_YR = dicts['date'].strip() GDP_VAL = dicts['value'] try: POPLTN_VAL = pop_dict['|'.join( map(str, [NAT_CD, NAT_NAME, ISO_WD3_NAT_CD]))][BASE_YR] except: POPLTN_VAL = '' res_line = [ NAT_CD, NAT_NAME, ISO_WD3_NAT_CD, BASE_YR, GDP_VAL, POPLTN_VAL, self.data_reg_dt ] # print('%s\n' % '|^|'.join(map(str, res_line))) item['body'] = '%s' % '|^|'.join(map(str, res_line)) yield item del pop_dict['|'.join( map(str, [NAT_CD, NAT_NAME, ISO_WD3_NAT_CD]))][BASE_YR] for dicts in pop_dict: for BASE_YR in pop_dict[dicts]: # __LOG__.Watch(dicts) # __LOG__.Watch(BASE_YR) # __LOG__.Watch(pop_dict[dicts][BASE_YR]) POPLTN_VAL = pop_dict[dicts][BASE_YR] NAT_CD, NAT_NAME, ISO_WD3_NAT_CD = dicts.split('|') GDP_VAL = '' res_line = [ NAT_CD, NAT_NAME, ISO_WD3_NAT_CD, BASE_YR, GDP_VAL, POPLTN_VAL, self.data_reg_dt ] # print('%s\n' % '|^|'.join(map(str, res_line))) item['body'] = '%s' % '|^|'.join(map(str, res_line)) yield item self.gdp_res = None self.pop_res = None except Exception as ex: print(ex)