Example #1
def selective_extractor((url, raw_content, selector, selector_type)):
    logger.debug('Start selective_extractor: %s' % url)
    result = ''
    elem = ''
    try:
        tree = etree.HTML(raw_content)
        if selector_type == 'xpath':
            elem = tree.xpath(selector)
        elif selector_type == 'css':
            css_selector = CSSSelector(selector)
            elem = css_selector(tree)

        if isinstance(elem, list):
            for e in elem:
                result += ' '.join(
                    get_unicode(x.text) for x in e.iter() if x.text) + ' '
        else:
            result = ' '.join(
                get_unicode(x.text) for x in elem.iter() if x.text)

    except Exception as ex:
        logger.exception('selector extractor error: %s' % ex.message)
        logger.error('url: %s' % url)

    logger.debug('End selective_extractor: %s' % url)
    return url, result
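Every example here passes values through a get_unicode helper that is not shown. The sketch below is a hypothetical stand-in, assuming the helper only needs to coerce bytes or arbitrary objects to text without raising; the project's real implementation may differ.

# Hypothetical sketch of the get_unicode helper used throughout;
# only the name comes from the examples, the body is an assumption.
def get_unicode(value, encoding='utf-8'):
    if value is None:
        return u''
    if isinstance(value, bytes):
        # tolerate undecodable bytes instead of failing the whole page
        return value.decode(encoding, errors='ignore')
    return u'%s' % value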
Example #2
def goose_dragnet_extractor((url, raw_content)):
    logger.debug('Start goose_dragnet_extractor: %s' % url)
    content = ''
    try:
        content = content_comments_extractor.analyze(raw_content)
    except Exception as ex:
        logger.error('dragnet extract page content and comment error: %s' % ex)

    meta_text = ''
    try:
        if raw_content and raw_content.strip():
            try:
                doc = get_goose_doc(raw_content)
                title = get_goose_content(url, doc, 'title')
                meta_description = get_goose_content(url, doc,
                                                     'meta_description')
                meta_keywords = get_goose_content(url, doc, 'meta_keywords')
                if not content:
                    content = get_goose_content(url, doc, 'cleaned_text')
                meta_text = ', '.join(c for c in [
                    get_unicode(title),
                    get_unicode(meta_description),
                    get_unicode(meta_keywords)
                ] if c)
            except Exception as ex:
                logger.error('get_goose_doc error: %s' % ex.message)
                logger.error('Url: %s' % url)

    except Exception as ex:
        logger.error('goose extract_page_content error: %s' % ex)
        logger.error('url: %s' % url)

    result = ', '.join(c for c in [get_unicode(content), meta_text] if c)
    logger.debug('End goose_dragnet_extractor: %s' % url)
    return url, result
Example #3
    def process(self, pages):
        self.logger.debug('Start extract pages: %s' % pages.keys())
        item_num = len(pages)
        if item_num > 10:
            # pick the extractor function for the concrete class
            func = dragnet_extractor
            if isinstance(self, DragnetPageExtractor):
                func = dragnet_extractor
            elif isinstance(self, ReadabilityPageExtractor):
                func = readability_extractor
            elif isinstance(self, GoosePageExtractor):
                func = goose_extractor
            elif isinstance(self, GooseDragnetPageExtractor):
                func = goose_dragnet_extractor
            elif isinstance(self, SelectivePageExtractor):
                func = selective_extractor
            elif isinstance(self, AllTextPageExtractor):
                func = all_text_extractor
            # use a process pool to extract pages in parallel
            pool = Pool(cpu_count())
            if isinstance(self, SelectivePageExtractor):
                data = [(get_unicode(url), page.get('content', ''),
                         self.selector, self.selector_type)
                        for url, page in pages.items() if page.get('content')]
            else:
                data = [(get_unicode(url), page.get('content', ''))
                        for url, page in pages.items()
                        if page.get('ok') and page.get('content')]
            pool_results = pool.map(func, data)
            # get results
            for r in pool_results:
                pages[r[0]]['content'] = r[1]

            pool.close()
            pool.terminate()
            for url, page in pages.items():
                if not page['content']:
                    page['content'] = url
                    continue
                page['content'] = ', '.join(c for c in [page['content']] if c)
        else:
            for url, page in pages.items():
                if not page['content']:
                    page['content'] = url
                    continue
                page['content'] = ', '.join(
                    c for c in [self.extract((url, page['content']))[1]] if c)

        self.logger.debug('End extract pages: %s' % pages.keys())
        return pages
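The process method above expects pages keyed by url, where each value carries at least 'ok' and 'content' fields; this shape is inferred from the dictionary accesses in the code, and the url and HTML below are made up for illustration.

# Illustrative input shape for process(); url and HTML are hypothetical.
pages = {
    'http://example.com/post/1': {
        'ok': True,
        'content': '<html><head><title>Post 1</title></head>'
                   '<body><p>Hello world</p></body></html>',
    },
}
# Assuming `extractor` is an instance of one of the *PageExtractor
# classes referenced above, a call would look like:
#   pages = extractor.process(pages)
#   text = pages['http://example.com/post/1']['content']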
Example #4
def visible(element):
    if element.parent.name in [
            'style', 'script', '[document]', 'head', 'title'
    ]:
        return False
    if isinstance(element, Comment):
        return False
    elif re.match(r'<!--.*-->', get_unicode(element)):
        return False
    return True
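A usage sketch for visible, assuming bs4's BeautifulSoup and Comment plus the re and get_unicode names referenced in the function are already imported; the HTML is invented.

# Hypothetical usage; bs4, lxml and the helpers above are assumed.
from bs4 import BeautifulSoup

html = ('<html><head><script>var x = 1;</script></head>'
        '<body>Hi<!-- hidden --></body></html>')
soup = BeautifulSoup(html, 'lxml')
texts = soup.findAll(text=True)
print([t for t in texts if visible(t)])  # -> [u'Hi'], script text and the comment are dropped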
Example #5
def dragnet_extractor((url, raw_content)):
    logger.debug('Start dragnet_extractor: %s' % url)
    content = ''
    try:
        content = content_comments_extractor.analyze(raw_content)
    except Exception as ex:
        logger.error('dragnet extract page content and comment error: %s' % ex)
        logger.error('url: %s' % url)

    result = ''
    try:
        elements = get_common_info(raw_content)
        elements.append(get_unicode(content))
        result = ', '.join(get_unicode(c) for c in elements if c)
    except Exception as ex:
        logger.error('Unicode issue: %s' % ex.message)

    logger.debug('End dragnet_extractor: %s' % url)
    return url, result
Example #6
def get_soup_meta(soup, name):
    metas = soup.findAll('meta')
    for meta in metas:
        # fall back to the 'property' attribute when 'name' is missing
        element_name = meta.get('name') or meta.get('property') or ''
        if re.findall(name, str(element_name), re.IGNORECASE):
            return get_unicode(meta.get('content', ''))

    return u''
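A small usage sketch for get_soup_meta, assuming BeautifulSoup 4 with the lxml parser and the get_unicode helper; the HTML is made up.

# Hypothetical usage; requires bs4, lxml and the get_unicode helper.
from bs4 import BeautifulSoup

html = ('<html><head>'
        '<meta name="description" content="A short page summary">'
        '<meta property="og:keywords" content="python, parsing">'
        '</head><body></body></html>')
soup = BeautifulSoup(html, 'lxml')
print(get_soup_meta(soup, 'description'))  # prints: A short page summary
print(get_soup_meta(soup, 'keywords'))     # prints: python, parsing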
Example #7
def get_common_info(raw_html):
    try:
        soup = BeautifulSoup(raw_html, 'lxml')
        title = soup.title.string if soup.title else u''
        title = get_unicode(title) if title else u''
        description = get_soup_meta(soup, 'description')
        keywords = get_soup_meta(soup, 'keywords')
    except Exception as ex:
        return []

    return [e for e in [title, description, keywords] if e]
Example #8
    def process(self, urls):
        result = {}
        urls = list(set(urls))

        if self.redis:
            # Get crawled pages
            for url in urls:
                page = self.redis.get(url)
                if not page:
                    continue
                self.logger.debug('Url was crawled: %s', url)
                result[url] = json.loads(get_unicode(page))

            self.logger.info("Num of crawled urls: %s" % len(result))
            # filter out urls that have already been crawled
            urls = [u for u in urls if u not in result]

            self.logger.info("Num of urls not crawled yet: %s" % len(urls))

            if not urls:
                self.logger.info('All urls have been crawled')
                return result

        # Crawl new urls
        if len(urls) > 2:
            # use a process pool to crawl pages in parallel
            pool = Pool(cpu_count() * 2)
            pool_results = pool.map(self._crawl_page, urls)
            # get results
            for r in pool_results:
                result.update(r)

            pool.terminate()
        else:
            for url in urls:
                result.update(self._crawl_page(url))

        if self.redis:
            # Cache result
            for url in urls:
                page = result[url]
                page['crawled_date'] = datetime.utcnow().strftime(
                    '%Y-%m-%d %H:%M:%S')
                self.redis.set(url,
                               json.dumps(page,
                                          ensure_ascii=False,
                                          encoding='utf-8'),
                               ex=self.expire_time)

        return result
Example #9
def get_common_info(url, raw_html):
    try:
        soup = build_sup(raw_html)
        title = soup.title.string if soup.title else u''
        title = get_unicode(title) if title else u''
        description = get_soup_meta(soup, 'description')
        keywords = get_soup_meta(soup, 'keywords')
    except Exception as ex:
        logger.exception('Error when get common info')
        return []

    return [
        e for e in [title, description, keywords,
                    get_text_from_url(url)] if e
    ]
Example #10
def get_text_from_url(url):
    try:
        parse_result = urlparse(url)
        if not parse_result:
            return ''
        path = ''
        if parse_result.path:
            path = ' '.join(t.strip()
                            for t in parse_result.path.split('/')
                            if t and t.strip() and '.' not in t)
        path = re.sub(r'[^A-Za-z0-9]', ' ', path)
        netloc_parts = parse_result.netloc.replace('www.', '').split('.')
        root_name = netloc_parts[0] if len(netloc_parts) > 0 else ''
        return get_unicode(root_name + ' ' + path)
    except Exception as ex:
        logger.exception('Error when get text from url')

    return ''
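For illustration, with a made-up url the function above returns the domain root plus the cleaned path tokens; the urlparse import and the get_unicode helper are assumed.

# Hypothetical call; the url is invented for illustration.
text = get_text_from_url('http://www.example-site.com/blog/some-post')
# path tokens without dots -> 'blog some-post'; non-alphanumerics become
# spaces -> 'blog some post'; netloc root -> 'example-site'
print(text)  # prints: example-site blog some post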
Example #11
def readability_extractor((url, raw_content)):
    logger.debug('Start readability_extractor: %s' % url)
    content = ''
    try:
        doc = Document(raw_content)
        content = doc.summary()
    except Exception as ex:
        logger.error('readability extract_page_content error: %s' % ex)
        logger.error('url: %s' % url)

    elements = get_common_info(raw_content)
    elements.append(get_unicode(content))
    result = ', '.join(c for c in elements if c)
    logger.debug('End readability_extractor: %s' % url)
    return url, result
Example #12
def process_job(df, selected_dm, unit, min_ngram, max_ngram, job_id,
                output_file):
    columns = list(df.columns.values)
    distance_cols = gen_distance_cols(columns)

    for col in distance_cols:
        df[col] = ''

    redis.hset(job_id, 'size', len(df.index))
    redis.hset(job_id, 'start', time.time())
    redis.hset(job_id, 'file', output_file)
    redis.hset(job_id, 'finish', 0)
    redis.hset(job_id, 'ok', 'true')
    redis.hset(job_id, 'error', '')

    try:
        tasks = [(tuple(get_unicode(row[col]) for col in columns), selected_dm,
                  unit, min_ngram, max_ngram, job_id)
                 for idx, row in df.iterrows()]

        pool = Pool(cpu_count())
        result = pool.map(cross_check_similarity_wrapper, tasks)
        pool.close()
        pool.terminate()

        for idx, row in df.iterrows():
            for dist_idx, col in enumerate(distance_cols):
                df.loc[idx, col] = result[idx][dist_idx]

        df.to_csv(os.path.join(app.config['UPLOAD_FOLDER'], output_file),
                  index=False,
                  sep='\t',
                  encoding='utf-8')
        redis.hset(job_id, 'finish', 1)

    except UnicodeEncodeError as e:
        redis.hset(job_id, 'ok', 'false')
        redis.hset(job_id, 'error',
                   'Input file should be in UTF-8 format, detail: %s' % e)
        logger.exception(e)
    except Exception as e:
        redis.hset(job_id, 'ok', 'false')
        redis.hset(job_id, 'error', '%s' % e.message)
        logger.exception(e)
Example #13
def all_text_extractor((url, raw_content)):
    logger.debug('Start all_text_extractor: %s' % url)
    result = ''
    try:
        soup = build_sup(raw_content)
        texts = soup.findAll(text=True)
        # Get all visible text
        visible_texts = filter(visible, texts)
        # Get common info
        common_texts = get_common_info(url, raw_content)

        all_texts = common_texts + visible_texts
        result = ', '.join(
            get_unicode(t.strip()) for t in all_texts if t and t.strip())
    except Exception as ex:
        logger.exception('All text extractor: %s' % ex.message)

    logger.debug('End all_text_extractor: %s' % url)
    return url, result
Example #14
    def _get_exist_voc(self, vocs, index_name, doc_type):
        # get existed vocabulary
        existed_voc = set()
        query = {
            'query': {
                'filtered': {
                    'filter': {
                        'terms': {
                            '_id': vocs
                        }
                    }
                }
            }
        }
        hits = scan(client=self.es, query=query, index=index_name, doc_type=doc_type)
        for hit in hits:
            existed_voc.add(get_unicode(hit['_source']['voc']))

        return existed_voc
Example #15
    def _crawl_page(self, url):
        self.logger.debug('Start crawl %s...' % url)
        result = {'content': '', 'error': False, 'message': ''}
        if url:
            # check database
            page = self.storage.find_one({'_id': url})
            if page and page.get('crawled_date'):
                self.logger.debug('Page was crawled (2nd check): ' +
                                  page['_id'])
                return {url: self.storage.find_one({'_id': url})}

            try:
                headers = {'User-Agent': self.user_agent}
                response = requests.get(url,
                                        verify=False,
                                        timeout=5,
                                        headers=headers)
                # flag an error when the response status is not OK
                if response.status_code == requests.codes.ok:
                    result['content'] = response.content
                else:
                    result['error'] = True
                    result['message'] = 'Page not found'

            except Exception as ex:
                self.logger.error('crawl_page error: %s' % ex.message)
                result['error'] = True
                result['message'] = str(ex.message)  # 'Page not found'
        else:
            result['error'] = True
            result['message'] = 'url is empty'

        # store the crawl result in the database
        result['_id'] = url
        result['crawled_date'] = datetime.utcnow()
        result['content'] = get_unicode(result['content'])
        self.logger.info('Update crawled page to db...')
        self.storage.update_one({'_id': url}, {'$set': result}, upsert=True)
        self.logger.debug('End crawl %s...' % url)
        return {url: self.storage.find_one({'_id': url})}
Example #16
def goose_extractor((url, raw_content)):
    logger.debug('Start goose_extractor: %s' % url)
    result = ''
    try:
        if raw_content and raw_content.strip():
            try:
                doc = get_goose_doc(raw_content)
                cleaned_text = get_goose_content(url, doc, 'cleaned_text')
                elements = get_common_info(url, raw_content)
                elements.append(get_unicode(cleaned_text))
                result = ', '.join(c for c in elements if c)
            except Exception as ex:
                logger.exception('get_goose_doc error: %s' % ex.message)
                logger.error('Url: %s' % url)

    except Exception as ex:
        logger.exception('goose extract_page_content timeout error: %s' %
                         ex.message)
        logger.error('url: %s' % url)

    logger.debug('End goose_extractor: %s' % url)
    return url, result
Example #17
def _normalize(text):
    return re.sub(r'\s+', ' ', get_unicode(text).strip().lower())
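A quick usage sketch for _normalize; the input string is invented and get_unicode is assumed as above.

# Collapses whitespace runs and lowercases the text.
print(_normalize('  Hello\tWorld \n  FOO  '))  # prints: hello world foo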