Python textPreprocessing示例

编程语言: Python

命名空间/包名称: spider_functions

方法/功能: textPreprocessing

hotexamples.com的示例: 7

Python textPreprocessing - 共找到 7 个示例。这些是从开源项目中提取的、评价最高的 spider_functions.textPreprocessing 真实 Python 示例。您可以对示例进行评价，以帮助我们提高示例质量。

示例#1

显示文件

 def parse(self, response):
     """Extract the article text from ``response``, clean it and store it.

     The first ``div`` with id ``content_box`` is trimmed to the article
     markup, stripped of the ad snippet, passed through
     ``fun.textPreprocessing`` and written to the ``News`` records whose
     ``link`` equals ``response.url`` (together with the bitcoin/ethereum
     flags and ``finished=True``).

     The method is best-effort: problems are reported with ``print`` and
     never raised to the caller.
     """
     text = response.xpath('//div[@id = "content_box"]').extract_first()
     if text is None:
         # No matching element: there is nothing to clean or store.
         print('error merkle')
         return

     # Keep what follows the header markup, when that markup is present.
     header_split = text.split('</span></div></div>')
     if len(header_split) > 1:
         text = header_split[1]
     else:
         print('error merkle')
     text = text.split('<script type="text/javascript">')[0]
     # `text` is a str here, so replace() cannot fail and needs no guard.
     text = text.replace(
         "freestar.queue.push(function () { "
         "googletag.display('TheMerkle_728x90_320x50_BTF'); });",
         '')

     # text processing
     # NOTE(review): the original ran textPreprocessing twice (once
     # unguarded, once guarded). A single guarded call is kept on the
     # assumption that the helper is idempotent -- confirm.
     try:
         text = fun.textPreprocessing(text)
     except Exception:
         print('error processing')

     try:
         # Same ad snippet, in the form it has after preprocessing.
         text = text.replace(
             'freestar queue push function googletag display TheMerkle_728x90_320x50_BTF',
             ' ')
     except AttributeError:
         # The helper's return type is not visible here; stay tolerant
         # if it did not return a string.
         print('error 2 replacing')

     try:
         News.update(
             body=text,
             bitcoinBoolean=fun.aboutBitcoin(text),
             ethereumBoolean=fun.aboutEthereum(text),
             finished=True).where(News.link == response.url).execute()
     except Exception:
         print('error storing')

示例#2

显示文件

 def parse(self, response):
     """Clean the article markup and store it on the matching News records.

     The first ``div`` whose class is "post-full-text contents" is passed
     through ``fun.textPreprocessing``; the result is written as ``body``
     together with the bitcoin/ethereum flags and ``finished=True`` to the
     ``News`` records whose ``link`` equals ``response.url``.
     """
     selector = '//div[@class = "post-full-text contents"]'
     raw_html = response.xpath(selector).extract_first()
     # text processing
     cleaned = fun.textPreprocessing(raw_html)
     fields = {
         'body': cleaned,
         'bitcoinBoolean': fun.aboutBitcoin(cleaned),
         'ethereumBoolean': fun.aboutEthereum(cleaned),
         'finished': True,
     }
     News.update(**fields).where(News.link == response.url).execute()

示例#3

显示文件

文件： spider_bitcoinmagazine_articles.py 项目： mjuchli/ctc-news-aggregator

 def parse(self, response):
     """Store the cleaned "rich-text" article body for ``response.url``.

     Everything from the ``<p class="tagline">`` marker onwards is dropped
     before the markup is handed to ``fun.textPreprocessing``. The result
     is written, with the bitcoin/ethereum flags and ``finished=True``, to
     the ``News`` records whose ``link`` equals ``response.url``.
     """
     raw_html = response.xpath('//div[@class = "rich-text"]').extract_first()
     # Keep only the part in front of the tagline paragraph.
     article_html, _, _ = raw_html.partition('<p class="tagline">')
     # text processing
     cleaned = fun.textPreprocessing(article_html)
     about_bitcoin = fun.aboutBitcoin(cleaned)
     about_ethereum = fun.aboutEthereum(cleaned)
     query = News.update(body=cleaned,
                         bitcoinBoolean=about_bitcoin,
                         ethereumBoolean=about_ethereum,
                         finished=True)
     query.where(News.link == response.url).execute()

示例#4

显示文件

文件： spider_cointelegraph_explained.py 项目： mjuchli/ctc-news-aggregator

 def parse(self, response):
     """Combine the title and content blocks, clean them and store the result.

     All ``div`` elements with class "name" followed by all with class
     "clearfix content" are concatenated, passed through
     ``fun.textPreprocessing`` and written, with the bitcoin/ethereum
     flags and ``finished=True``, to the ``News`` records whose ``link``
     equals ``response.url``.
     """
     textParts = response.xpath('//div[@class = "name"]').extract()
     textParts = textParts + response.xpath(
         '//div[@class = "clearfix content"]').extract()
     # Join in a single pass. The original appended unicode(part) in a loop,
     # which is quadratic and relies on a builtin that only exists on
     # Python 2; extract() already returns text strings (per Scrapy docs).
     text = ''.join(textParts)
     text = fun.textPreprocessing(text)
     News.update(body=text,
                 bitcoinBoolean=fun.aboutBitcoin(text),
                 ethereumBoolean=fun.aboutEthereum(text),
                 finished=True).where(News.link == response.url).execute()

示例#5

显示文件

    def parse(self, response):
        """Store the cleaned "entry-content" article body for ``response.url``.

        The markup is cut at the first "Disclaimer" marker and then at the
        ad-script marker before being handed to ``fun.textPreprocessing``.
        The result is written, with the bitcoin/ethereum flags and
        ``finished=True``, to the ``News`` records whose ``link`` equals
        ``response.url``.
        """
        raw_html = response.xpath(
            '//div[@class = "entry-content"]').extract_first()
        # Drop everything from each trailing marker onwards, in this order.
        cut_markers = ('Disclaimer', 'CDATA id15 Content Ad 2 OA_show 15 ')
        for marker in cut_markers:
            raw_html = raw_html.split(marker)[0]

        # text processing
        cleaned = fun.textPreprocessing(raw_html)

        values = dict(body=cleaned,
                      bitcoinBoolean=fun.aboutBitcoin(cleaned),
                      ethereumBoolean=fun.aboutEthereum(cleaned),
                      finished=True)
        News.update(**values).where(News.link == response.url).execute()

示例#6

显示文件

 def parse(self, response):
     """Clean the article container, trim trailing noise and store the text.

     The first ``div`` with class "article-content-container noskimwords"
     is passed through ``fun.textPreprocessing``. The cleaned text is then
     cut at the first occurrence of each trailing marker (leftover script
     text and image credits) and written, with the bitcoin/ethereum flags
     and ``finished=True``, to the ``News`` records whose ``link`` equals
     ``str(response.url)``.
     """
     selector = '//div[@class = "article-content-container noskimwords"]'
     raw_html = response.xpath(selector).extract_first()
     # text processing
     cleaned = fun.textPreprocessing(raw_html)
     # Applied in order; each cut keeps only the text before the marker.
     trailing_markers = (
         'function e t r n c a l',
         'image via ',
         'Image via ',
         'via Shutter',
     )
     for marker in trailing_markers:
         cleaned = cleaned.split(marker)[0]
     about_bitcoin = fun.aboutBitcoin(cleaned)
     about_ethereum = fun.aboutEthereum(cleaned)
     query = News.update(
         body=cleaned,
         bitcoinBoolean=about_bitcoin,
         ethereumBoolean=about_ethereum,
         finished=True)
     query.where(News.link == str(response.url)).execute()

示例#7

显示文件

 def parse(self, response):
     """Extract the article text from the "post-info" block and store it.

     The article markup is taken from after the second
     ``<!--Content Ad -->`` marker or, failing that, from after the reddit
     share button. If neither marker is present an error is printed and
     the whole block is used. The footer and the "Header image" credit are
     cut off, the text is passed through ``fun.textPreprocessing`` and
     written, with the bitcoin/ethereum flags and ``finished=True``, to
     the ``News`` records whose ``link`` equals ``response.url``.
     """
     text = response.xpath('//div[@class = "post-info"]').extract_first()
     if text is None:
         # No matching element: nothing to clean or store. The original
         # printed this message and then crashed on the next split.
         print('error livebitcoinnews')
         return

     # Explicit length checks replace the nested bare ``except:`` blocks,
     # which would also have swallowed unrelated errors.
     ad_parts = text.split('<!--Content Ad -->')
     if len(ad_parts) > 2:
         text = ad_parts[2]
     else:
         share_parts = text.split('<li class="sm-share reddit">')
         if len(share_parts) > 1:
             text = share_parts[1]
         else:
             print('error livebitcoinnews')
     text = text.split('<footer class=')[0]
     text = text.split('Header image')[0]
     #text processing
     text = fun.textPreprocessing(text)
     #only alphabetic
     News.update(body=text,
                 bitcoinBoolean=fun.aboutBitcoin(text),
                 ethereumBoolean=fun.aboutEthereum(text),
                 finished=True).where(News.link == response.url).execute()