def search(self):
    """Build a query from the form fields, run the scraper, and display results."""
    params = self.getInputData()
    chosen_source = params.pop('source')
    self.scraper = Scraper(params, chosen_source)
    logging.info('Scraper search' + str(params))
    hits = self.scraper.search()
    self.table.fillTable(hits)
    # Only reveal "Next" when there is more than one page (10 results/page).
    if self.scraper.total > 10:
        self.button3.setVisible(True)
    self.update_label()
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Fetch the politics RSS feeds of the major Argentine news outlets, then prune junk."""
from scrapers import Scraper, removeJunk

sc = Scraper()

# (scraper method, feed URL) pairs, one per outlet, run in order.
_FEEDS = (
    (sc.scrapeInfobae, "http://cdn01.ib.infobae.com/adjuntos/162/rss/politica.xml"),
    (sc.scrapeLaNacion, "http://contenidos.lanacion.com.ar/herramientas/rss-categoria_id=30"),
    (sc.scrapeClarin, "http://www.clarin.com/rss/politica/"),
    (sc.scrapePagina12, "http://www.pagina12.com.ar/diario/rss/ultimas_noticias.xml"),
    (sc.scrapePerfil, "http://www.perfil.com/rss/politica.xml"),
    (sc.scrapeInfonews, "http://www.infonews.com/rss/politica.xml"),
    (sc.scrapeMendozaOnline, "http://www.mdzol.com/files/rss/politica.xml"),
    (sc.scrapeTelam, "http://www.telam.com.ar/rss2/politica.xml"),
    (sc.scrapeLosAndes, "http://losandes.com.ar/rss/politica"),
    (sc.scrapeLaVoz, "http://www.lavoz.com.ar/taxonomy/term/4/1/feed"),
)

for scrape, url in _FEEDS:
    scrape(url)

removeJunk()
class Window2(QDialog):
    """Search dialog: queries a Scraper source from a small form, pages
    through the results 10 at a time, and saves a selected row into the
    SQL model (downloading a linked PDF into cfg['temp'] when present).
    """

    def __init__(self, model, parent=None):
        """Build the form, results table, paging buttons, and grid layout.

        model -- the Qt SQL table model records are inserted into.
        """
        super(Window2, self).__init__(parent)
        self.model = model
        self.mainLayout = QGridLayout()
        self.mainLayout.setColumnStretch(0, 1)
        self.mainLayout.setColumnStretch(1, 3)
        self.mainLayout.setColumnStretch(2, 3)
        # Form rows double as the query field names (see getInputData).
        self.flayout = QFormLayout()
        self.flayout.addRow('title', QLineEdit())
        self.flayout.addRow('author', QLineEdit())
        self.flayout.addRow('abstract', QLineEdit())
        self.comboBox = QComboBox()
        self.comboBox.addItems(Scraper.scrapers)
        self.flayout.addRow('source', self.comboBox)
        self.table = TableWidget()
        self.table.doubleClicked.connect(self.openDocument)
        self.setFixedSize(1000, 600)
        button1 = QPushButton()
        button1.setText("Search")
        button1.clicked.connect(self.search)
        button2 = QPushButton()
        button2.setText("Save")
        button2.clicked.connect(self.save)
        # "Next"/"Previous" stay hidden until paging is actually possible.
        self.button3 = QPushButton()
        self.button3.setText("Next")
        self.button3.clicked.connect(self.next_results)
        self.button3.setVisible(False)
        self.button4 = QPushButton()
        self.button4.setText("Previous")
        self.button4.clicked.connect(self.previous_results)
        self.button4.setVisible(False)
        self.label = QLabel()
        self.label.setVisible(False)
        self.setWindowTitle('Search')
        self.mainLayout.addLayout(self.flayout, 0, 0)
        self.mainLayout.addWidget(self.table, 0, 1, 1, 2)
        self.mainLayout.addWidget(self.label, 1, 0)
        self.mainLayout.addWidget(self.button4, 1, 1)
        self.mainLayout.addWidget(self.button3, 1, 2)
        self.mainLayout.addWidget(button1, 2, 0)
        self.mainLayout.addWidget(button2, 2, 1, 1, 2)
        self.setLayout(self.mainLayout)

    def getInputData(self):
        """Return the form as a dict {row label: entered text}, plus the
        selected 'source'. Form items alternate label widget / field widget,
        hence the 2*i / 2*i+1 indexing; the last row (the combo box) is
        skipped by the range and read separately."""
        inputData = {
            self.flayout.itemAt(2 * i).widget().text():
            self.flayout.itemAt(2 * i + 1).widget().text()
            for i in range(int(self.flayout.count() / 2) - 1)
        }
        inputData['source'] = self.comboBox.currentText()
        return inputData

    def search(self):
        """Run a fresh scraper query and show page 1 of the results."""
        inputData = self.getInputData()
        source = inputData.pop('source')
        self.scraper = Scraper(inputData, source)
        logging.info('Scraper search' + str(inputData))
        results = self.scraper.search()
        self.table.fillTable(results)
        if self.scraper.total > 10:
            self.button3.setVisible(True)
        self.update_label()

    def update_label(self):
        """Show 'Page X of Y' for the current scraper state.

        Uses ceiling division: the previous int(total / 10) + 1 reported one
        page too many whenever total was an exact multiple of 10.
        """
        pages = max(1, -(-self.scraper.total // 10))
        self.label.setText('Page {} of {}'.format(self.scraper.page, pages))
        self.label.setVisible(True)

    def next_results(self):
        """Advance one page; hide 'Next' when the last page is reached."""
        self.scraper.page += 1
        results = self.scraper.search()
        self.table.fillTable(results)
        self.button4.setVisible(True)
        if self.scraper.total <= 10 * self.scraper.page:
            self.button3.setVisible(False)
        self.update_label()

    def previous_results(self):
        """Go back one page; hide 'Previous' when back on page 1."""
        self.scraper.page -= 1
        results = self.scraper.search()
        self.table.fillTable(results)
        self.button3.setVisible(True)
        if self.scraper.page <= 1:
            self.button4.setVisible(False)
        self.update_label()

    def openDocument(self, y):
        """Open a double-clicked cell in the browser when it holds a URL."""
        if 'http' in y.data():
            webbrowser.open(y.data(), new=2)

    def save(self):
        """Insert the first selected table row into the model as a new record.

        Column headers become record field names; a 'document' cell that
        looks like a PDF link is downloaded first via save_file.
        """
        index = self.table.selectionModel().selectedRows()
        if len(index) > 0:
            new_data = {
                self.table.horizontalHeaderItem(i).text():
                str(self.table.model().index(index[0].row(), i).data())
                for i in range(self.table.columnCount())
            }
            if 'document' in new_data and 'pdf' in new_data['document']:
                new_data = self.save_file(new_data)
            # (removed unused row_index = self.model.rowCount(QModelIndex()))
            record = self.model.record()
            record.setGenerated('id', False)
            record.setValue('created', QDateTime.currentDateTime())
            for column in new_data:
                record.setValue(column, new_data[column])
            self.model.insertRecord(-1, record)

    def save_file(self, new_data):
        """Download new_data['document'] into cfg['temp'] and report the
        outcome in a message box.

        On success 'document' is replaced by the local filename and 'length'
        is set to the PDF page count; on failure 'document' is cleared.
        Returns the (possibly modified) new_data dict.
        """
        if 'document' in new_data and len(new_data['document']) > 0:
            # Surnames before each comma (or end of string) form the author part.
            author = ', '.join(
                re.findall(r'(\w*)(?:$|,)', new_data.get('author'))[:-1])
            title = re.sub(r"[^a-zA-Z0-9]+", ' ', new_data.get('title'))
            date = new_data.get('date') if new_data.get('date') else ''
            filename = date + ' ' + title + ' - ' + author + '.pdf'
            path = os.path.join(cfg['temp'], filename)
            logging.info('Trying to save file ' + filename)
            if not os.path.exists(path):
                response = requests.get(new_data['document'], headers=_HEADERS)
                if response.ok:
                    try:
                        with open(path, 'wb') as f:
                            f.write(response.content)
                        new_data['document'] = filename
                        display_text = 'Saved document ' + filename
                        # Page count is best-effort; a broken PDF downgrades
                        # the message (previously 'Corrupted' was immediately
                        # overwritten and never shown). The `with` also closes
                        # the handle the old code leaked.
                        try:
                            with open(path, 'rb') as pdf:
                                new_data['length'] = \
                                    PdfFileReader(pdf).getNumPages()
                        except Exception:
                            display_text = 'Corrupted document ' + filename
                    except Exception:
                        display_text = ('Download document successful, '
                                        'but not possible to save.')
                        new_data['document'] = ''
                else:
                    display_text = 'Download document not successful.'
                    new_data['document'] = ''
            else:
                display_text = 'File ' + filename + ' already exists.'
        else:
            display_text = 'There is no document to save.'
        msgBox = QMessageBox()
        msgBox.setText(display_text)
        msgBox.exec_()
        logging.info(display_text)
        return new_data
def main(): from optparse import OptionParser, OptionGroup descr = ("A simple tool for archiving fanfiction for offline reading " + "and converting said archives into ready-to-read eBooks for pocket " + "reading devices.") epilog = ("As an alternative to explicitly specifying a personality, " + "this command will alter its behaviour if called by the following names:" + " " + ', '.join(sorted(Personality.personalities))) parser = OptionParser(version="%%prog v%s" % __version__, usage="%prog [options] <url> ...", description=descr, epilog=epilog) parser.add_option('-b', '--bundle', action="store_true", dest="bundle", default=False, help="Also bundle the entire story into a single file" + "with chapter headings and a table of contents.") parser.add_option('-t', '--target', action="store", dest="target", metavar="DIR", default=os.getcwd(), help="Specify a target directory other than the current working directory.") parser.add_option('--list_supported', action="store_true", dest="list_supported", default=False, help="List installed scrapers and personalities.") parser.add_option('-P', '--personality', action="store", dest="persona", metavar="NAME", default=None, help="Set the personality the conversion will operate under. See --list_supported.") #pre_group = OptionGroup(parser, "Pre-Processing Options") #pre_group.add_option('--strip-accents', action="store_true", dest="strip_accents", # default=False, help="Remove diacritics for compatibility with readers with " + # "limited fonts and no internal fallback mechanism. (eg. Sony PRS-505)") pp_group = OptionGroup(parser, "Post-Processing Options") pp_group.add_option('-p', '--postproc', action="append", dest="postproc", metavar="CMD", default=[], help="Call the specified post-processor after each retrieval " + "completes. Can be used multiple times. 
Implies --bundle.") pp_group.add_option('-e', '--final_ext', action="store", dest="final_ext", metavar="EXT", default='.out', help="Set the extension to be used in the output filename " + "available to post-processor templates.") parser.add_option_group(pp_group) opts, args = parser.parse_args() cmd = parser.get_prog_name() if opts.list_supported: names = sorted(Scraper.scrapers[x].site_name for x in Scraper.scrapers) print "Scrapers:\n\t" + '\n\t'.join(names) print print "Personalities:\n\t" + '\n\t'.join(sorted(Personality.personalities)) parser.exit() if not args: parser.print_help() parser.exit() persona = Personality.get(opts.persona or cmd)() for option in persona.opts: setattr(opts, option, persona.opts[option]) if opts.postproc: opts.bundle = True for url_arg in args: scraper = Scraper.get(url_arg)(opts.target, opts.bundle, opts.final_ext) try: downloaded_story = scraper.download_fic(url_arg) except Exception, err: print "Failed to retrieve story %s" % url_arg print "TODO: Handle this properly" continue persona.postproc(downloaded_story) if opts.postproc: inputs = { 'appname' : "%s v%s" % (__appname__, __version__), 'author' : downloaded_story.author, 'bundle' : downloaded_story.path, 'category' : downloaded_story.category, 'coverfile' : downloaded_story.cover, 'outfile' : downloaded_story.final_path, 'site_name' : downloaded_story.site_name, 'title' : downloaded_story.title } for pp_cmdline in opts.postproc: cmdlist = pp_cmdline.strip().split() print "Calling post-processor: %s" % cmdlist[0] subprocess.call([r % inputs for r in cmdlist])