def run(self, _dummy=None):
    file = self.options['file']
    log.info(u"Importing {self.__class__.__name__} from {file.name} into {self.project}"
             .format(**locals()))
    from amcat.scraping.controller import RobustController
    self.controller = RobustController(self.articleset)
    arts = list(self.controller.scrape(self))
    if not arts:
        raise Exception("No articles were imported")
    self.postprocess(arts)
    old_provenance = [] if self.articleset.provenance is None else [self.articleset.provenance]
    new_provenance = self.get_provenance(file, arts)
    self.articleset.provenance = "\n".join([new_provenance] + old_provenance)
    self.articleset.save()
    return arts

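# For illustration only, a minimal standard-library sketch of how run() above prepends
# the new provenance line to any existing provenance on the article set. The helper
# name prepend_provenance and the example strings are hypothetical.
def prepend_provenance(existing, new_line):
    old = [] if existing is None else [existing]
    return "\n".join([new_line] + old)

assert prepend_provenance(None, "[2013-01-01 12:00] Uploaded 3 articles") == \
    "[2013-01-01 12:00] Uploaded 3 articles"
assert prepend_provenance("older entry", "newer entry") == "newer entry\nolder entry"
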
def run(self, _input):
    if not self.options['last_date']:
        self.options['last_date'] = date.today()
    deduplicate = bool(self.options['deduplicate'])
    n_days = (self.options['last_date'] - self.options['first_date']).days
    days = [self.options['first_date'] + timedelta(days=x)
            for x in range(n_days + 1)]
    scrapers = [self.get_scraper(d) for d in days]
    RobustController().scrape(scrapers, deduplicate=deduplicate)

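# For illustration, a self-contained sketch of the inclusive date-range expansion used
# above; the helper name days_between is hypothetical and only the standard library
# is assumed.
from datetime import date, timedelta

def days_between(first_date, last_date):
    """Yield every date from first_date through last_date, inclusive."""
    n_days = (last_date - first_date).days
    for offset in range(n_days + 1):
        yield first_date + timedelta(days=offset)

# Example: an inclusive four-day range yields one date (and hence one scraper) per day.
assert list(days_between(date(2013, 1, 1), date(2013, 1, 4))) == [
    date(2013, 1, 1), date(2013, 1, 2), date(2013, 1, 3), date(2013, 1, 4)]
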
def run(self, input=None, deduplicate=False):
    log.info("Scraping {self.__class__.__name__} into {self.project}, "
             "medium {self.medium} using RobustController".format(**locals()))
    from amcat.scraping.controller import RobustController
    return RobustController(self.articleset).scrape([self], deduplicate)

def run(self, _input=None):
    scraper = self.options["scraper"].get_scraper(date=self.options["date"])
    from amcat.scraping.controller import RobustController
    controller = RobustController()
    controller.scrape(scraper)

class UploadScript(Scraper):
    """Base class for Upload Scripts, which are scraper scripts driven by the
    script input.

    For legacy reasons, parse_document and split_file may be used instead
    of the standard get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def get_errors(self):
        """Return a list of (document index, message) pairs that explain encountered errors"""
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return
        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        return "Error in element {error.i} : {error.error!r}".format(**locals())

    def decode(self, bytes):
        """Decode the bytes using the encoding from the form"""
        enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        n = len(articles)
        filename = file.name
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}".format(**locals()))

    def run(self, _dummy=None):
        file = self.options['file']
        log.info(u"Importing {self.__class__.__name__} from {file.name} into {self.project}"
                 .format(**locals()))
        from amcat.scraping.controller import RobustController
        self.controller = RobustController(self.articleset)
        arts = list(self.controller.scrape(self))
        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)
        old_provenance = [] if self.articleset.provenance is None else [self.articleset.provenance]
        new_provenance = self.get_provenance(file, arts)
        self.articleset.provenance = "\n".join([new_provenance] + old_provenance)
        self.articleset.save()
        return arts

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list
        will exclude them from the article set (if needed, the list should be
        changed in place)
        """
        pass

    def _get_units(self):
        """
        The upload script assumes that the form (!) has a get_entries method, which
        you get if you subclass your form from one of the fileupload forms.
        If not, please override this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        result = self.parse_document(document)
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy purposes

        @param document: object received from split_file, e.g. a string fragment
        @return: None, an Article, or a sequence of Articles
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual documents.
        Default implementation returns the file itself as a single fragment.

        @param file: the uploaded file (or entry from the form)
        @return: a sequence of objects (e.g. strings) to pass to parse_document
        """
        return [file]

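# For illustration only: a hypothetical minimal subclass of UploadScript (not part of
# the codebase) showing the legacy split_file / parse_document hooks. The class name
# LineUpload, the Article field names (headline, text, date, medium), the 'date' form
# field, and the assumption that the form yields file-like entries are all illustrative.
class LineUpload(UploadScript):
    """Upload script that treats every non-empty line of the uploaded file as one article."""

    def split_file(self, file):
        # One fragment per non-empty line; decode() applies the encoding chosen in the form
        text = self.decode(file.read())
        return [line for line in text.splitlines() if line.strip()]

    def parse_document(self, line):
        # Turn a single fragment into an Article; run() takes care of adding it to the
        # article set and recording provenance.
        return Article(headline=line[:100], text=line,
                       date=self.options['date'], medium=self.medium)
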