Пример #1
0
 def parse_content(self, response):
     """Extract the main content of one crawled page and count completions.

     Stops doing work once the wanted number of pages has been extracted.
     `cx.crawl(...)` returns None on failure, so only successful
     extractions bump the counter.
     """
     # Early exit once enough pages have already been processed.
     if self.__num_of_done >= self.__num_of_wanted:
         return
     cx = CxExtractor(threshold=186)
     print(response.url)
     # Only count the page when extraction actually produced output.
     if cx.crawl(response.url, response.body, self.__path,
                 response.meta['filename'], response.meta['engine'],
                 response.meta['rank']) is not None:
         # NOTE(review): this increment is not thread-safe; the
         # commented-out lock suggests that was known. Confirm the
         # crawler runs these callbacks single-threaded.
         # self.__lock.acquire()
         self.__num_of_done += 1
         print(self.__num_of_done)
Пример #2
0
 def parse(self, response):
     """Extract this page's content, then follow every anchor on it.

     Yields one follow-up request per href found, re-entering this
     callback for each linked page.
     """
     soup = BeautifulSoup(response.body, 'html.parser')
     # Only anchors that actually carry an href attribute.
     links = soup.find_all('a', href=True)
     url = response.url.encode('utf-8')
     print(response.url)
     cx = CxExtractor(threshold=36)
     # crawl() writes the extracted content under self.__path.
     print(cx.crawl(response.url, response.body, self.__path, url))
     for link in links:
         print('    ' + link['href'].encode('utf-8'))
         yield response.follow(link['href'], self.parse)
Пример #3
0
 def WebCrawl(self, html_list):
     """Extract readable text from each document in *html_list*.

     Args:
         html_list: iterable of sources accepted by CxExtractor.getHtml().

     Returns:
         list of extracted text strings; items whose extraction fails,
         or that have no extractable content, are skipped.
     """
     output = list()
     cx = CxExtractor(threshold=180, blocksWidth=3)
     for html in html_list:
         try:
             test_html = cx.getHtml(html)
             content = cx.filter_tags(test_html)
             s = cx.getText(content)
             print(html)
         except Exception:
             # BUG FIX: the original bare `except:` fell through and then
             # tested `s`, which is undefined on a first-iteration failure
             # (NameError) or stale from the previous iteration. Skip the
             # failed item instead.
             print("Extract Abnormally")
             continue
         if s != "This page has no content to extract":
             output.append(s)
     return output
Пример #4
0
def index(request):
    """Serve the HTML for a URL, fetching and caching it on a cache miss.

    The target URL is everything in the request path after 'index/'.
    Raises ValueError (from str.index) if 'index/' is absent from the path.
    """
    # Call get_full_path() once instead of twice as the original did.
    full_path = request.get_full_path()
    actual_url = full_path[full_path.index('index/') + len('index/'):]
    flag, detailPage = get_saved_page(actual_url)
    if flag:
        # Cache hit: reuse the stored HTML.
        html = detailPage.html
    else:
        # Cache miss: download the page, then persist it for next time.
        cx = CxExtractor(threshold=186)
        html = cx.getHtml(actual_url)

        detailPage = DetailPage()
        detailPage.html = html
        detailPage.url = actual_url
        detailPage.time = datetime.datetime.now()
        detailPage.save()

    return HttpResponse(html)
Пример #5
0
from CxExtractor import CxExtractor

# Fetch a news article and print its extracted main text.
extractor = CxExtractor(threshold=186)
# Alternative source: "http://www.bbc.com/news/world-europe-40885324"
page_html = extractor.getHtml("http://news.163.com/17/0810/09/CRFF02Q100018AOR.html")
filtered = extractor.filter_tags(page_html)
print(extractor.getText(filtered))
Пример #6
0
"""
一个HTML文件,找出里面的正文。
"""
from CxExtractor import CxExtractor
cx = CxExtractor()

#从读取内容
#html = cx.getHtml("https://github.com/Yixiaohan/show-me-the-code")
html = cx.readHtml("show-me-the-code.html", 'utf-8')

#过滤干扰标签
content = cx.filter_tags(html)

print(cx.getText(content))
Пример #7
0
import os
from CxExtractor import CxExtractor

# Convert every saved BBC News HTML page into a plain-text file.
files = os.listdir('./bbcnews-html')
cx = CxExtractor(threshold=186)
for f in files:
    html = cx.readHtml('./bbcnews-html/' + f, 'utf-8')
    content = cx.filter_tags(html)
    text = cx.getText(content)
    # BUG FIX: os.path.splitext keeps 'a.b.html' -> 'a.b', whereas the
    # original f.split('.')[0] truncated any filename containing dots.
    out_name = os.path.splitext(f)[0] + '.txt'
    with open('./bbcnews-text/' + out_name,
              'w',
              encoding='utf-8') as textfile:
        textfile.write(text)
Пример #8
0
# -*- coding: utf8 -*-

from CxExtractor import CxExtractor

# Extract and print the readable text of a locally saved page.
extractor = CxExtractor(threshold=186)
# Network variant: extractor.getHtml("https://www.dianping.com/home-tuku/k3?utm_source=pc_shouye_smalltuku5")
document = extractor.readHtml("Text/001.html")
cleaned = extractor.filter_tags(document)
print(extractor.getText(cleaned))
import time


# Map of news-source keys to their landing-page URLs.
# NOTE(review): 'dfcf' points at the same URL as 'sina' — looks like a
# copy-paste placeholder; confirm the intended East Money URL.
stock_news = {
    'sina': "https://finance.sina.com.cn/stock/",
    "dfcf": "https://finance.sina.com.cn/stock/",
    # BUG FIX: the original listed "ifeng" twice with the identical value;
    # the duplicate has been dropped (the resulting dict is unchanged).
    "ifeng": "http://finance.ifeng.com/",
    "ftchinese": "http://www.ftchinese.com/channel/economy.html",
    "nature_researchAnalysis": "https://www.nature.com/research-analysis",
    "zdnet": "https://www.zdnet.com/"
}


cx = CxExtractor(threshold=86)
# NOTE(review): data_dict is not defined in this snippet — presumably
# imported from dir_control.data_dir_v1 as in the next example; verify.
dir_all_news = data_dict.get("all_news")
print(dir_all_news)
def get_text(key):
    """Download the page registered under *key* in stock_news and extract
    its readable text.

    NOTE(review): the function builds the output path `files` but never
    writes to it and returns nothing — it looks truncated by the scrape;
    confirm the missing save/return step against the original source.
    """
    raw_html = stock_news.get(key)
    html = cx.getHtml(raw_html)
    content = cx.filter_tags(html)
    s = cx.getText(content)
    #print(s)
    # save dir
    dir_all_news=data_dict.get("all_news")
    today=time.strftime("%Y-%m-%d", time.localtime())
    file_name = key+"_"+today+".txt"
    files=os.path.join(dir_all_news,file_name)
Пример #10
0
from CxExtractor import CxExtractor
import sys
sys.path.append("../")
from dir_control.data_dir_v1 import data_dict, stk_index_list
import os

# Download the Sina finance stock page, extract its readable text, and
# save it under the configured "all_news" directory.
cx = CxExtractor(threshold=186)
# Alternative sources:
# html = cx.getHtml("http://www.bbc.com/news/world-europe-40885324")
# html = cx.getHtml("http://www.eastmoney.com/")
html = cx.getHtml("https://finance.sina.com.cn/stock/")
content = cx.filter_tags(html)
s = cx.getText(content)
#print(s)
dir_all_news = data_dict.get("all_news")
files = os.path.join(dir_all_news, "test.txt")
# BUG FIX: the original opened the file and never closed it; use a
# context manager and an explicit encoding.
with open(files, 'w+', encoding='utf-8') as f:
    print(s, file=f)