Пример #1
0
 def parse_content(self, response):
     """Extract the main content of one crawled page and count completions.

     Stops doing work once the wanted number of pages has been extracted.
     `cx.crawl(...)` returns None on failure, so only successful
     extractions bump the counter.
     """
     # Early exit once enough pages have already been processed.
     if self.__num_of_done >= self.__num_of_wanted:
         return
     cx = CxExtractor(threshold=186)
     print(response.url)
     # Only count the page when extraction actually produced output.
     if cx.crawl(response.url, response.body, self.__path,
                 response.meta['filename'], response.meta['engine'],
                 response.meta['rank']) is not None:
         # NOTE(review): this increment is not thread-safe; the
         # commented-out lock suggests that was known. Confirm the
         # crawler runs these callbacks single-threaded.
         # self.__lock.acquire()
         self.__num_of_done += 1
         print(self.__num_of_done)
Пример #2
0
 def parse(self, response):
     """Extract this page's content, then follow every anchor on it.

     Yields one follow-up request per href found, re-entering this
     callback for each linked page.
     """
     soup = BeautifulSoup(response.body, 'html.parser')
     # Only anchors that actually carry an href attribute.
     links = soup.find_all('a', href=True)
     url = response.url.encode('utf-8')
     print(response.url)
     cx = CxExtractor(threshold=36)
     # crawl() writes the extracted content under self.__path.
     print(cx.crawl(response.url, response.body, self.__path, url))
     for link in links:
         print('    ' + link['href'].encode('utf-8'))
         yield response.follow(link['href'], self.parse)
Пример #3
0
 def WebCrawl(self, html_list):
     """Extract readable text from each document in *html_list*.

     Args:
         html_list: iterable of sources accepted by CxExtractor.getHtml().

     Returns:
         list of extracted text strings; items whose extraction fails,
         or that have no extractable content, are skipped.
     """
     output = list()
     cx = CxExtractor(threshold=180, blocksWidth=3)
     for html in html_list:
         try:
             test_html = cx.getHtml(html)
             content = cx.filter_tags(test_html)
             s = cx.getText(content)
             print(html)
         except Exception:
             # BUG FIX: the original bare `except:` fell through and then
             # tested `s`, which is undefined on a first-iteration failure
             # (NameError) or stale from the previous iteration. Skip the
             # failed item instead.
             print("Extract Abnormally")
             continue
         if s != "This page has no content to extract":
             output.append(s)
     return output
Пример #4
0
def index(request):
    """Serve the HTML for a URL, fetching and caching it on a cache miss.

    The target URL is everything in the request path after 'index/'.
    Raises ValueError (from str.index) if 'index/' is absent from the path.
    """
    # Call get_full_path() once instead of twice as the original did.
    full_path = request.get_full_path()
    actual_url = full_path[full_path.index('index/') + len('index/'):]
    flag, detailPage = get_saved_page(actual_url)
    if flag:
        # Cache hit: reuse the stored HTML.
        html = detailPage.html
    else:
        # Cache miss: download the page, then persist it for next time.
        cx = CxExtractor(threshold=186)
        html = cx.getHtml(actual_url)

        detailPage = DetailPage()
        detailPage.html = html
        detailPage.url = actual_url
        detailPage.time = datetime.datetime.now()
        detailPage.save()

    return HttpResponse(html)
Пример #5
0
from CxExtractor import CxExtractor

# Fetch a news article and print its extracted main text.
extractor = CxExtractor(threshold=186)
# Alternative source: "http://www.bbc.com/news/world-europe-40885324"
page_html = extractor.getHtml("http://news.163.com/17/0810/09/CRFF02Q100018AOR.html")
filtered = extractor.filter_tags(page_html)
print(extractor.getText(filtered))
Пример #6
0
"""
一个HTML文件,找出里面的正文。
"""
from CxExtractor import CxExtractor
cx = CxExtractor()

#从读取内容
#html = cx.getHtml("https://github.com/Yixiaohan/show-me-the-code")
html = cx.readHtml("show-me-the-code.html", 'utf-8')

#过滤干扰标签
content = cx.filter_tags(html)

print(cx.getText(content))
Пример #7
0
import os
from CxExtractor import CxExtractor

# Convert every saved BBC News HTML page into a plain-text file.
files = os.listdir('./bbcnews-html')
cx = CxExtractor(threshold=186)
for f in files:
    html = cx.readHtml('./bbcnews-html/' + f, 'utf-8')
    content = cx.filter_tags(html)
    text = cx.getText(content)
    # BUG FIX: os.path.splitext keeps 'a.b.html' -> 'a.b', whereas the
    # original f.split('.')[0] truncated any filename containing dots.
    out_name = os.path.splitext(f)[0] + '.txt'
    with open('./bbcnews-text/' + out_name,
              'w',
              encoding='utf-8') as textfile:
        textfile.write(text)
Пример #8
0
# -*- coding: utf8 -*-

from CxExtractor import CxExtractor

# Extract and print the readable text of a locally saved page.
extractor = CxExtractor(threshold=186)
# Network variant: extractor.getHtml("https://www.dianping.com/home-tuku/k3?utm_source=pc_shouye_smalltuku5")
document = extractor.readHtml("Text/001.html")
cleaned = extractor.filter_tags(document)
print(extractor.getText(cleaned))
import time


# Map of news-source keys to their landing-page URLs.
# NOTE(review): 'dfcf' points at the same URL as 'sina' — looks like a
# copy-paste placeholder; confirm the intended East Money URL.
stock_news = {
    'sina': "https://finance.sina.com.cn/stock/",
    "dfcf": "https://finance.sina.com.cn/stock/",
    # BUG FIX: the original listed "ifeng" twice with the identical value;
    # the duplicate has been dropped (the resulting dict is unchanged).
    "ifeng": "http://finance.ifeng.com/",
    "ftchinese": "http://www.ftchinese.com/channel/economy.html",
    "nature_researchAnalysis": "https://www.nature.com/research-analysis",
    "zdnet": "https://www.zdnet.com/"
}


cx = CxExtractor(threshold=86)
# NOTE(review): data_dict is not defined in this snippet — presumably
# imported from dir_control.data_dir_v1 as in the next example; verify.
dir_all_news = data_dict.get("all_news")
print(dir_all_news)
def get_text(key):
    """Download the page registered under *key* in stock_news and extract
    its readable text.

    NOTE(review): the function builds the output path `files` but never
    writes to it and returns nothing — it looks truncated by the scrape;
    confirm the missing save/return step against the original source.
    """
    raw_html = stock_news.get(key)
    html = cx.getHtml(raw_html)
    content = cx.filter_tags(html)
    s = cx.getText(content)
    #print(s)
    # save dir
    dir_all_news=data_dict.get("all_news")
    today=time.strftime("%Y-%m-%d", time.localtime())
    file_name = key+"_"+today+".txt"
    files=os.path.join(dir_all_news,file_name)
Пример #10
0
from CxExtractor import CxExtractor
import sys
sys.path.append("../")
from dir_control.data_dir_v1 import data_dict, stk_index_list
import os

# Download the Sina finance stock page, extract its readable text, and
# save it under the configured "all_news" directory.
cx = CxExtractor(threshold=186)
# Alternative sources:
# html = cx.getHtml("http://www.bbc.com/news/world-europe-40885324")
# html = cx.getHtml("http://www.eastmoney.com/")
html = cx.getHtml("https://finance.sina.com.cn/stock/")
content = cx.filter_tags(html)
s = cx.getText(content)
#print(s)
dir_all_news = data_dict.get("all_news")
files = os.path.join(dir_all_news, "test.txt")
# BUG FIX: the original opened the file and never closed it; use a
# context manager and an explicit encoding.
with open(files, 'w+', encoding='utf-8') as f:
    print(s, file=f)