def search(self):
    """Build a query from the form fields, run the scraper, and display results."""
    params = self.getInputData()
    chosen_source = params.pop('source')
    self.scraper = Scraper(params, chosen_source)
    logging.info('Scraper search' + str(params))
    hits = self.scraper.search()
    self.table.fillTable(hits)
    # Only reveal "Next" when there is more than one page (10 results/page).
    if self.scraper.total > 10:
        self.button3.setVisible(True)
    self.update_label()
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Fetch the politics RSS feeds of the major Argentine news outlets, then prune junk."""
from scrapers import Scraper, removeJunk

sc = Scraper()

# (scraper method, feed URL) pairs, one per outlet, run in order.
_FEEDS = (
    (sc.scrapeInfobae, "http://cdn01.ib.infobae.com/adjuntos/162/rss/politica.xml"),
    (sc.scrapeLaNacion, "http://contenidos.lanacion.com.ar/herramientas/rss-categoria_id=30"),
    (sc.scrapeClarin, "http://www.clarin.com/rss/politica/"),
    (sc.scrapePagina12, "http://www.pagina12.com.ar/diario/rss/ultimas_noticias.xml"),
    (sc.scrapePerfil, "http://www.perfil.com/rss/politica.xml"),
    (sc.scrapeInfonews, "http://www.infonews.com/rss/politica.xml"),
    (sc.scrapeMendozaOnline, "http://www.mdzol.com/files/rss/politica.xml"),
    (sc.scrapeTelam, "http://www.telam.com.ar/rss2/politica.xml"),
    (sc.scrapeLosAndes, "http://losandes.com.ar/rss/politica"),
    (sc.scrapeLaVoz, "http://www.lavoz.com.ar/taxonomy/term/4/1/feed"),
)

for scrape, url in _FEEDS:
    scrape(url)

removeJunk()
class Window2(QDialog):
    """Search dialog: queries a Scraper source from a small form, pages
    through the results 10 at a time, and saves a selected row into the
    SQL model (downloading a linked PDF into cfg['temp'] when present).
    """

    def __init__(self, model, parent=None):
        """Build the form, results table, paging buttons, and grid layout.

        model -- the Qt SQL table model records are inserted into.
        """
        super(Window2, self).__init__(parent)
        self.model = model
        self.mainLayout = QGridLayout()
        self.mainLayout.setColumnStretch(0, 1)
        self.mainLayout.setColumnStretch(1, 3)
        self.mainLayout.setColumnStretch(2, 3)
        # Form rows double as the query field names (see getInputData).
        self.flayout = QFormLayout()
        self.flayout.addRow('title', QLineEdit())
        self.flayout.addRow('author', QLineEdit())
        self.flayout.addRow('abstract', QLineEdit())
        self.comboBox = QComboBox()
        self.comboBox.addItems(Scraper.scrapers)
        self.flayout.addRow('source', self.comboBox)
        self.table = TableWidget()
        self.table.doubleClicked.connect(self.openDocument)
        self.setFixedSize(1000, 600)
        button1 = QPushButton()
        button1.setText("Search")
        button1.clicked.connect(self.search)
        button2 = QPushButton()
        button2.setText("Save")
        button2.clicked.connect(self.save)
        # "Next"/"Previous" stay hidden until paging is actually possible.
        self.button3 = QPushButton()
        self.button3.setText("Next")
        self.button3.clicked.connect(self.next_results)
        self.button3.setVisible(False)
        self.button4 = QPushButton()
        self.button4.setText("Previous")
        self.button4.clicked.connect(self.previous_results)
        self.button4.setVisible(False)
        self.label = QLabel()
        self.label.setVisible(False)
        self.setWindowTitle('Search')
        self.mainLayout.addLayout(self.flayout, 0, 0)
        self.mainLayout.addWidget(self.table, 0, 1, 1, 2)
        self.mainLayout.addWidget(self.label, 1, 0)
        self.mainLayout.addWidget(self.button4, 1, 1)
        self.mainLayout.addWidget(self.button3, 1, 2)
        self.mainLayout.addWidget(button1, 2, 0)
        self.mainLayout.addWidget(button2, 2, 1, 1, 2)
        self.setLayout(self.mainLayout)

    def getInputData(self):
        """Return the form as a dict {row label: entered text}, plus the
        selected 'source'. Form items alternate label widget / field widget,
        hence the 2*i / 2*i+1 indexing; the last row (the combo box) is
        skipped by the range and read separately."""
        inputData = {
            self.flayout.itemAt(2 * i).widget().text():
            self.flayout.itemAt(2 * i + 1).widget().text()
            for i in range(int(self.flayout.count() / 2) - 1)
        }
        inputData['source'] = self.comboBox.currentText()
        return inputData

    def search(self):
        """Run a fresh scraper query and show page 1 of the results."""
        inputData = self.getInputData()
        source = inputData.pop('source')
        self.scraper = Scraper(inputData, source)
        logging.info('Scraper search' + str(inputData))
        results = self.scraper.search()
        self.table.fillTable(results)
        if self.scraper.total > 10:
            self.button3.setVisible(True)
        self.update_label()

    def update_label(self):
        """Show 'Page X of Y' for the current scraper state.

        Uses ceiling division: the previous int(total / 10) + 1 reported one
        page too many whenever total was an exact multiple of 10.
        """
        pages = max(1, -(-self.scraper.total // 10))
        self.label.setText('Page {} of {}'.format(self.scraper.page, pages))
        self.label.setVisible(True)

    def next_results(self):
        """Advance one page; hide 'Next' when the last page is reached."""
        self.scraper.page += 1
        results = self.scraper.search()
        self.table.fillTable(results)
        self.button4.setVisible(True)
        if self.scraper.total <= 10 * self.scraper.page:
            self.button3.setVisible(False)
        self.update_label()

    def previous_results(self):
        """Go back one page; hide 'Previous' when back on page 1."""
        self.scraper.page -= 1
        results = self.scraper.search()
        self.table.fillTable(results)
        self.button3.setVisible(True)
        if self.scraper.page <= 1:
            self.button4.setVisible(False)
        self.update_label()

    def openDocument(self, y):
        """Open a double-clicked cell in the browser when it holds a URL."""
        if 'http' in y.data():
            webbrowser.open(y.data(), new=2)

    def save(self):
        """Insert the first selected table row into the model as a new record.

        Column headers become record field names; a 'document' cell that
        looks like a PDF link is downloaded first via save_file.
        """
        index = self.table.selectionModel().selectedRows()
        if len(index) > 0:
            new_data = {
                self.table.horizontalHeaderItem(i).text():
                str(self.table.model().index(index[0].row(), i).data())
                for i in range(self.table.columnCount())
            }
            if 'document' in new_data and 'pdf' in new_data['document']:
                new_data = self.save_file(new_data)
            # (removed unused row_index = self.model.rowCount(QModelIndex()))
            record = self.model.record()
            record.setGenerated('id', False)
            record.setValue('created', QDateTime.currentDateTime())
            for column in new_data:
                record.setValue(column, new_data[column])
            self.model.insertRecord(-1, record)

    def save_file(self, new_data):
        """Download new_data['document'] into cfg['temp'] and report the
        outcome in a message box.

        On success 'document' is replaced by the local filename and 'length'
        is set to the PDF page count; on failure 'document' is cleared.
        Returns the (possibly modified) new_data dict.
        """
        if 'document' in new_data and len(new_data['document']) > 0:
            # Surnames before each comma (or end of string) form the author part.
            author = ', '.join(
                re.findall(r'(\w*)(?:$|,)', new_data.get('author'))[:-1])
            title = re.sub(r"[^a-zA-Z0-9]+", ' ', new_data.get('title'))
            date = new_data.get('date') if new_data.get('date') else ''
            filename = date + ' ' + title + ' - ' + author + '.pdf'
            path = os.path.join(cfg['temp'], filename)
            logging.info('Trying to save file ' + filename)
            if not os.path.exists(path):
                response = requests.get(new_data['document'], headers=_HEADERS)
                if response.ok:
                    try:
                        with open(path, 'wb') as f:
                            f.write(response.content)
                        new_data['document'] = filename
                        display_text = 'Saved document ' + filename
                        # Page count is best-effort; a broken PDF downgrades
                        # the message (previously 'Corrupted' was immediately
                        # overwritten and never shown). The `with` also closes
                        # the handle the old code leaked.
                        try:
                            with open(path, 'rb') as pdf:
                                new_data['length'] = \
                                    PdfFileReader(pdf).getNumPages()
                        except Exception:
                            display_text = 'Corrupted document ' + filename
                    except Exception:
                        display_text = ('Download document successful, '
                                        'but not possible to save.')
                        new_data['document'] = ''
                else:
                    display_text = 'Download document not successful.'
                    new_data['document'] = ''
            else:
                display_text = 'File ' + filename + ' already exists.'
        else:
            display_text = 'There is no document to save.'
        msgBox = QMessageBox()
        msgBox.setText(display_text)
        msgBox.exec_()
        logging.info(display_text)
        return new_data
def main(): from optparse import OptionParser, OptionGroup descr = ("A simple tool for archiving fanfiction for offline reading " + "and converting said archives into ready-to-read eBooks for pocket " + "reading devices.") epilog = ("As an alternative to explicitly specifying a personality, " + "this command will alter its behaviour if called by the following names:" + " " + ', '.join(sorted(Personality.personalities))) parser = OptionParser(version="%%prog v%s" % __version__, usage="%prog [options] <url> ...", description=descr, epilog=epilog) parser.add_option('-b', '--bundle', action="store_true", dest="bundle", default=False, help="Also bundle the entire story into a single file" + "with chapter headings and a table of contents.") parser.add_option('-t', '--target', action="store", dest="target", metavar="DIR", default=os.getcwd(), help="Specify a target directory other than the current working directory.") parser.add_option('--list_supported', action="store_true", dest="list_supported", default=False, help="List installed scrapers and personalities.") parser.add_option('-P', '--personality', action="store", dest="persona", metavar="NAME", default=None, help="Set the personality the conversion will operate under. See --list_supported.") #pre_group = OptionGroup(parser, "Pre-Processing Options") #pre_group.add_option('--strip-accents', action="store_true", dest="strip_accents", # default=False, help="Remove diacritics for compatibility with readers with " + # "limited fonts and no internal fallback mechanism. (eg. Sony PRS-505)") pp_group = OptionGroup(parser, "Post-Processing Options") pp_group.add_option('-p', '--postproc', action="append", dest="postproc", metavar="CMD", default=[], help="Call the specified post-processor after each retrieval " + "completes. Can be used multiple times. 
Implies --bundle.") pp_group.add_option('-e', '--final_ext', action="store", dest="final_ext", metavar="EXT", default='.out', help="Set the extension to be used in the output filename " + "available to post-processor templates.") parser.add_option_group(pp_group) opts, args = parser.parse_args() cmd = parser.get_prog_name() if opts.list_supported: names = sorted(Scraper.scrapers[x].site_name for x in Scraper.scrapers) print "Scrapers:\n\t" + '\n\t'.join(names) print print "Personalities:\n\t" + '\n\t'.join(sorted(Personality.personalities)) parser.exit() if not args: parser.print_help() parser.exit() persona = Personality.get(opts.persona or cmd)() for option in persona.opts: setattr(opts, option, persona.opts[option]) if opts.postproc: opts.bundle = True for url_arg in args: scraper = Scraper.get(url_arg)(opts.target, opts.bundle, opts.final_ext) try: downloaded_story = scraper.download_fic(url_arg) except Exception, err: print "Failed to retrieve story %s" % url_arg print "TODO: Handle this properly" continue persona.postproc(downloaded_story) if opts.postproc: inputs = { 'appname' : "%s v%s" % (__appname__, __version__), 'author' : downloaded_story.author, 'bundle' : downloaded_story.path, 'category' : downloaded_story.category, 'coverfile' : downloaded_story.cover, 'outfile' : downloaded_story.final_path, 'site_name' : downloaded_story.site_name, 'title' : downloaded_story.title } for pp_cmdline in opts.postproc: cmdlist = pp_cmdline.strip().split() print "Calling post-processor: %s" % cmdlist[0] subprocess.call([r % inputs for r in cmdlist])