def selective_extractor((url, raw_content, selector, selector_type)):
    logger.debug('Start selective_extractor: %s' % url)
    result = ''
    elem = ''
    try:
        tree = etree.HTML(raw_content)
        if selector_type == 'xpath':
            elem = tree.xpath(selector)
        elif selector_type == 'css':
            css_selector = CSSSelector(selector)
            elem = css_selector(tree)
        if type(elem) is list:
            for e in elem:
                result += ' '.join(
                    get_unicode(x.text) for x in e.iter() if x.text) + ' '
        else:
            result = ' '.join(
                get_unicode(x.text) for x in elem.iter() if x.text)
    except Exception as ex:
        logger.exception('selector extractor error: %s' % ex.message)
        logger.error('url: %s' % url)
    logger.debug('End selective_extractor: %s' % url)
    return url, result
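# Usage sketch (hypothetical URL and markup; selective_extractor never fetches
# the URL itself, it only parses the raw markup it is given). The single-tuple
# argument is Python 2 unpacking, which is what lets Pool.map feed it directly.
example_html = '<html><body><div class="post"><p>First</p><p>Second</p></div></body></html>'
print selective_extractor(('http://example.com/a', example_html, '//div[@class="post"]', 'xpath'))
# -> ('http://example.com/a', u'First Second ')
print selective_extractor(('http://example.com/a', example_html, 'div.post', 'css'))
# -> ('http://example.com/a', u'First Second ')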
def goose_dragnet_extractor((url, raw_content)):
    logger.debug('Start goose_dragnet_extractor: %s' % url)
    content = ''
    try:
        content = content_comments_extractor.analyze(raw_content)
    except Exception as ex:
        logger.error('dragnet extract page content and comment error: %s' % ex)
    meta_text = ''
    try:
        if raw_content and raw_content.strip():
            try:
                doc = get_goose_doc(raw_content)
                title = get_goose_content(url, doc, 'title')
                meta_description = get_goose_content(url, doc, 'meta_description')
                meta_keywords = get_goose_content(url, doc, 'meta_keywords')
                if not content:
                    content = get_goose_content(url, doc, 'cleaned_text')
                meta_text = ', '.join(c for c in [
                    get_unicode(title),
                    get_unicode(meta_description),
                    get_unicode(meta_keywords)
                ] if c)
            except Exception as ex:
                logger.error('get_goose_doc error: %s' % ex.message)
                logger.error('Url: %s' % url)
    except Exception as ex:
        logger.error('goose extract_page_content error: %s' % ex)
        logger.error('url: %s' % url)
    result = ', '.join(c for c in [get_unicode(content), meta_text] if c)
    logger.debug('End goose_dragnet_extractor: %s' % url)
    return url, result
def process(self, pages):
    self.logger.debug('Start extract pages: %s' % pages.keys())
    item_num = len(pages)
    if item_num > 10:
        # pick the extractor function matching this extractor class
        func = dragnet_extractor
        if isinstance(self, DragnetPageExtractor):
            func = dragnet_extractor
        elif isinstance(self, ReadabilityPageExtractor):
            func = readability_extractor
        elif isinstance(self, GoosePageExtractor):
            func = goose_extractor
        elif isinstance(self, GooseDragnetPageExtractor):
            func = goose_dragnet_extractor
        elif isinstance(self, SelectivePageExtractor):
            func = selective_extractor
        elif isinstance(self, AllTextPageExtractor):
            func = all_text_extractor
        # use a process pool to extract pages in parallel
        pool = Pool(cpu_count())
        if isinstance(self, SelectivePageExtractor):
            data = [(get_unicode(url), page.get('content', ''),
                     self.selector, self.selector_type)
                    for url, page in pages.items() if page.get('content')]
        else:
            data = [(get_unicode(url), page.get('content', ''))
                    for url, page in pages.items()
                    if page.get('ok') and page.get('content')]
        pool_results = pool.map(func, data)
        # collect results back into the pages dict
        for r in pool_results:
            pages[r[0]]['content'] = r[1]
        pool.close()
        pool.terminate()
        for url, page in pages.items():
            if not page['content']:
                page['content'] = url
                continue
            page['content'] = ', '.join(c for c in [page['content']] if c)
    else:
        for url, page in pages.items():
            if not page['content']:
                page['content'] = url
                continue
            page['content'] = ', '.join(
                c for c in [self.extract((url, page['content']))[1]] if c)
    self.logger.debug('End extract pages: %s' % pages.keys())
    return pages
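# For reference, process() expects `pages` to map each URL to the dict produced
# by the crawler. The keys below mirror what the code reads ('ok', 'content');
# the URL and markup are hypothetical.
example_pages = {
    u'http://example.com/post': {
        'ok': True,                      # crawl succeeded
        'content': '<html>...</html>',   # raw HTML returned by the crawler
    },
}
# After process(), each page['content'] holds the extracted text,
# or the URL itself when nothing could be extracted.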
def visible(element):
    if element.parent.name in [
            'style', 'script', '[document]', 'head', 'title'
    ]:
        return False
    if isinstance(element, Comment):
        return False
    elif re.match(r'<!--.*-->', get_unicode(element)):
        return False
    return True
def dragnet_extractor((url, raw_content)):
    logger.debug('Start dragnet_extractor: %s' % url)
    content = ''
    try:
        content = content_comments_extractor.analyze(raw_content)
    except Exception as ex:
        logger.error('dragnet extract page content and comment error: %s' % ex)
        logger.error('url: %s' % url)
    result = ''
    try:
        elements = get_common_info(raw_content)
        elements.append(get_unicode(content))
        result = ', '.join(get_unicode(c) for c in elements if c)
    except Exception as ex:
        logger.error('Unicode issue: %s' % ex.message)
    logger.debug('End dragnet_extractor: %s' % url)
    return url, result
def get_soup_meta(soup, name):
    metas = soup.findAll('meta')
    for meta in metas:
        # fall back to the 'property' attribute (e.g. Open Graph tags)
        # when the meta tag has no 'name'
        element_name = meta.get('name') or meta.get('property') or ''
        if re.findall(name, element_name, re.IGNORECASE):
            return get_unicode(meta.get('content', ''))
    return u''
def get_common_info(raw_html):
    try:
        soup = BeautifulSoup(raw_html, 'lxml')
        title = soup.title.string if soup.title else u''
        title = get_unicode(title) if title else u''
        description = get_soup_meta(soup, 'description')
        keywords = get_soup_meta(soup, 'keywords')
    except Exception:
        return []
    return [e for e in [title, description, keywords] if e]
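# Quick sanity check for the two helpers above (made-up HTML snippet).
example_html = ('<html><head><title>Red Shoes</title>'
                '<meta name="description" content="Hand-made red shoes.">'
                '<meta name="keywords" content="shoes, red">'
                '</head><body></body></html>')
example_soup = BeautifulSoup(example_html, 'lxml')
print get_soup_meta(example_soup, 'description')   # u'Hand-made red shoes.'
print get_common_info(example_html)                # [u'Red Shoes', u'Hand-made red shoes.', u'shoes, red']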
def process(self, urls):
    result = {}
    urls = list(set(urls))
    if self.redis:
        # Get pages that were already crawled and cached
        for url in urls:
            page = self.redis.get(url)
            if not page:
                continue
            self.logger.debug('Url was crawled: %s', url)
            result[url] = json.loads(get_unicode(page))
        self.logger.info("Num of crawled urls: %s" % len(result))
        # Filter out urls that are already cached
        urls = [u for u in urls if u not in result]
        self.logger.info("Remaining uncrawled urls: %s" % len(urls))
        if not urls:
            self.logger.info('All urls have been crawled')
            return result
    # Crawl new urls
    if len(urls) > 2:
        # use a process pool to crawl pages in parallel
        pool = Pool(cpu_count() * 2)
        pool_results = pool.map(self._crawl_page, urls)
        # collect results
        for r in pool_results:
            result.update(r)
        pool.terminate()
    else:
        for url in urls:
            result.update(self._crawl_page(url))
    if self.redis:
        # Cache the freshly crawled pages
        for url in urls:
            page = result[url]
            page['crawled_date'] = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
            self.redis.set(url,
                           json.dumps(page, ensure_ascii=False, encoding='utf-8'),
                           ex=self.expire_time)
    return result
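# Hypothetical call, assuming the owning class wires up self.redis,
# self.storage and self.user_agent; `crawler` is an instance of that class.
pages = crawler.process(['http://example.com/a', 'http://example.com/b'])
for url, page in pages.items():
    # Each value mirrors the dict built in _crawl_page() below:
    # {'content': ..., 'error': ..., 'message': ..., 'crawled_date': ...}
    print url, page['error'], len(page['content'])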
def get_common_info(url, raw_html):
    try:
        soup = build_sup(raw_html)
        title = soup.title.string if soup.title else u''
        title = get_unicode(title) if title else u''
        description = get_soup_meta(soup, 'description')
        keywords = get_soup_meta(soup, 'keywords')
    except Exception:
        logger.exception('Error when getting common info')
        return []
    return [
        e for e in [title, description, keywords, get_text_from_url(url)] if e
    ]
def get_text_from_url(url):
    try:
        parse_result = urlparse(url)
        if not parse_result:
            return ''
        path = ' '.join([t.strip() for t in parse_result.path.split('/')
                         if t and t.strip() and '.' not in t]) \
            if parse_result.path else ''
        path = re.sub(r'[^A-Za-z0-9]', ' ', path)
        netloc_parts = parse_result.netloc.replace('www.', '').split('.')
        root_name = netloc_parts[0] if len(netloc_parts) > 0 else ''
        return get_unicode(root_name + ' ' + path)
    except Exception:
        logger.exception('Error when getting text from url')
        return ''
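# Worked example (hypothetical URL): path segments containing a dot
# (e.g. 'index.html') are dropped, punctuation in the path becomes spaces,
# and the domain's root label is prefixed.
print get_text_from_url('http://www.example-shop.com/products/red-shoes/index.html')
# -> u'example-shop products red shoes'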
def readability_extractor((url, raw_content)):
    logger.debug('Start readability_extractor: %s' % url)
    content = ''
    try:
        doc = Document(raw_content)
        content = doc.summary()
    except Exception as ex:
        logger.error('readability extract_page_content error: %s' % ex)
        logger.error('url: %s' % url)
    elements = get_common_info(raw_content)
    elements.append(get_unicode(content))
    result = ', '.join(c for c in elements if c)
    logger.debug('End readability_extractor: %s' % url)
    # return the (url, result) pair expected by process(), like the other extractors
    return url, result
def process_job(df, selected_dm, unit, min_ngram, max_ngram, job_id, output_file):
    columns = list(df.columns.values)
    distance_cols = gen_distance_cols(columns)
    for col in distance_cols:
        df[col] = ''
    redis.hset(job_id, 'size', len(df.index))
    redis.hset(job_id, 'start', time.time())
    redis.hset(job_id, 'file', output_file)
    redis.hset(job_id, 'finish', 0)
    redis.hset(job_id, 'ok', 'true')
    redis.hset(job_id, 'error', '')
    try:
        tasks = [(tuple(get_unicode(row[col]) for col in columns),
                  selected_dm, unit, min_ngram, max_ngram, job_id)
                 for idx, row in df.iterrows()]
        pool = Pool(cpu_count())
        result = pool.map(cross_check_similarity_wrapper, tasks)
        pool.close()
        pool.terminate()
        for idx, row in df.iterrows():
            for dist_idx, col in enumerate(distance_cols):
                df.loc[idx, col] = result[idx][dist_idx]
        df.to_csv(os.path.join(app.config['UPLOAD_FOLDER'], output_file),
                  index=False, sep='\t', encoding='utf-8')
        redis.hset(job_id, 'finish', 1)
    except UnicodeEncodeError as e:
        redis.hset(job_id, 'ok', 'false')
        redis.hset(job_id, 'error',
                   'Input file should be in UTF-8 format, detail: %s' % e)
        logger.exception(e)
    except Exception as e:
        redis.hset(job_id, 'ok', 'false')
        redis.hset(job_id, 'error', '%s' % e.message)
        logger.exception(e)
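# gen_distance_cols() is defined elsewhere; a minimal sketch of what it is
# assumed to do (one output column per pair of input columns, since the job
# cross-checks similarity between columns). The naming scheme is a guess and
# only illustrates how result[idx][dist_idx] lines up with distance_cols.
from itertools import combinations

def gen_distance_cols(columns):
    return ['%s__%s_distance' % (a, b) for a, b in combinations(columns, 2)]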
def all_text_extractor((url, raw_content)):
    logger.debug('Start all_text_extractor: %s' % url)
    result = ''
    try:
        soup = build_sup(raw_content)
        texts = soup.findAll(text=True)
        # Get all visible text
        visible_texts = filter(visible, texts)
        # Get common info
        common_texts = get_common_info(url, raw_content)
        all_texts = common_texts + visible_texts
        result = ', '.join(
            get_unicode(t.strip()) for t in all_texts if t and t.strip())
    except Exception as ex:
        logger.exception('All text extractor: %s' % ex.message)
    logger.debug('End all_text_extractor: %s' % url)
    return url, result
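# build_sup() (used here and in get_common_info above) is defined elsewhere;
# presumably it is a thin wrapper around BeautifulSoup. A sketch under that
# assumption, mirroring the explicit BeautifulSoup(raw_html, 'lxml') call
# used in the single-argument get_common_info variant:
from bs4 import BeautifulSoup

def build_sup(raw_html):
    return BeautifulSoup(raw_html, 'lxml')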
def _get_exist_voc(self, vocs, index_name, doc_type):
    # get existing vocabulary entries whose _id is in the given list
    existed_voc = set()
    query = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {
                        '_id': vocs
                    }
                }
            }
        }
    }
    hits = scan(client=self.es, query=query, index=index_name,
                doc_type=doc_type)
    for hit in hits:
        existed_voc.add(get_unicode(hit['_source']['voc']))
    return existed_voc
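# The query filters on document _id and reads hits back via _source['voc'],
# so the index is presumably populated with one document per vocabulary term,
# using the term both as the _id and as the 'voc' field. A hypothetical
# indexing call consistent with that assumption (index/doc_type names are
# placeholders; the 'filtered' query above targets the pre-5.x Elasticsearch DSL).
# `es` is an elasticsearch.Elasticsearch client, like self.es above.
es.index(index='vocabulary', doc_type='voc',
         id=u'machine learning', body={'voc': u'machine learning'})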
def _crawl_page(self, url):
    self.logger.debug('Start crawl %s...' % url)
    result = {'content': '', 'error': False, 'message': ''}
    if url:
        # check whether the page is already in the database
        page = self.storage.find_one({'_id': url})
        if page and page.get('crawled_date'):
            self.logger.debug('Page was crawled (2nd check): ' + page['_id'])
            return {url: self.storage.find_one({'_id': url})}
        try:
            headers = {'User-Agent': self.user_agent}
            response = requests.get(url, verify=False, timeout=5,
                                    headers=headers)
            if response.status_code == requests.codes.ok:
                result['content'] = response.content
            else:
                result['error'] = True
                result['message'] = 'Page not found'
        except Exception as ex:
            self.logger.error('crawl_page error: %s' % ex.message)
            result['error'] = True
            result['message'] = str(ex.message)
    else:
        result['error'] = True
        result['message'] = 'url is empty'
    # store the result in the database
    result['_id'] = url
    result['crawled_date'] = datetime.utcnow()
    result['content'] = get_unicode(result['content'])
    self.logger.info('Update crawled page to db...')
    self.storage.update_one({'_id': url}, {'$set': result}, upsert=True)
    self.logger.debug('End crawl %s...' % url)
    return {url: self.storage.find_one({'_id': url})}
def goose_extractor((url, raw_content)):
    logger.debug('Start goose_extractor: %s' % url)
    result = ''
    try:
        if raw_content and raw_content.strip():
            try:
                doc = get_goose_doc(raw_content)
                cleaned_text = get_goose_content(url, doc, 'cleaned_text')
                elements = get_common_info(url, raw_content)
                elements.append(get_unicode(cleaned_text))
                result = ', '.join(c for c in elements if c)
            except Exception as ex:
                logger.exception('get_goose_doc error: %s' % ex.message)
                logger.error('Url: %s' % url)
    except Exception as ex:
        logger.exception('goose extract_page_content timeout error: %s' % ex.message)
        logger.error('url: %s' % url)
    logger.debug('End goose_extractor: %s' % url)
    return url, result
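# get_goose_doc() and get_goose_content() are shared by several extractors here
# but defined elsewhere; assuming they wrap the python-goose library, they
# likely look roughly like this sketch (signatures and error handling are guesses).
from goose import Goose

def get_goose_doc(raw_content):
    # Parse raw HTML into a goose Article object.
    return Goose().extract(raw_html=raw_content)

def get_goose_content(url, doc, attr):
    # Read one Article attribute ('title', 'cleaned_text', 'meta_description',
    # 'meta_keywords'), falling back to an empty string on failure.
    try:
        return getattr(doc, attr) or ''
    except Exception as ex:
        logger.error('get_goose_content %s error for %s: %s' % (attr, url, ex))
        return ''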
def _normalize(text):
    return re.sub(r'\s+', ' ', get_unicode(text).strip().lower())
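# get_unicode() is used throughout but defined elsewhere; under the assumption
# that it simply coerces byte strings to unicode (this is Python 2 code), a
# minimal sketch plus a _normalize() example:
def get_unicode(text):
    # Assumed helper; the real implementation may differ.
    if isinstance(text, unicode):
        return text
    if isinstance(text, str):
        return text.decode('utf-8', 'ignore')
    return unicode(text)

print _normalize('  Hello   WORLD\n')   # -> u'hello world'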