示例#1
0
    def run(self, _dummy=None):
        """Import the uploaded file into the article set and record provenance.

        Scrapes this script with a RobustController constructed from
        ``self.articleset``, hands the resulting articles to ``postprocess``
        and prepends a new provenance line to the article set before saving it.

        @param _dummy: unused
        @return: the list of articles produced by the controller
        @raise Exception: if the controller produced no articles
        """
        upload = self.options['file']
        # Explicit arguments instead of **locals(): the message only needs
        # `self` and the uploaded file.
        log.info(
            u"Importing {self.__class__.__name__} from {file.name} into {self.project}"
            .format(self=self, file=upload))
        # Function-level import kept from the original
        # (presumably avoids a circular import -- TODO confirm).
        from amcat.scraping.controller import RobustController
        self.controller = RobustController(self.articleset)

        arts = list(self.controller.scrape(self))
        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # The newest provenance entry goes first, followed by any existing history.
        previous = self.articleset.provenance
        provenance_lines = [self.get_provenance(upload, arts)]
        if previous is not None:
            provenance_lines.append(previous)
        self.articleset.provenance = "\n".join(provenance_lines)
        self.articleset.save()

        return arts
示例#2
0
    def run(self, _input):
        """Scrape every day from 'first_date' through 'last_date' (inclusive).

        When the 'last_date' option is empty it is set to today's date and
        stored back into ``self.options``. One scraper is created per day via
        ``get_scraper`` and all of them are passed to a RobustController.

        @param _input: unused
        """
        if not self.options['last_date']:
            self.options['last_date'] = date.today()

        # bool() instead of `x and True`: the latter lets falsy values such as
        # None or '' through unchanged rather than yielding False.
        deduplicate = bool(self.options['deduplicate'])

        first_date = self.options['first_date']
        n_days = (self.options['last_date'] - first_date).days
        # n_days + 1 so that last_date itself is included
        scrapers = [
            self.get_scraper(first_date + timedelta(days=offset))
            for offset in range(n_days + 1)
        ]
        RobustController().scrape(scrapers, deduplicate=deduplicate)
示例#3
0
    def run(self, _dummy=None):
        """Run the upload: scrape the file, postprocess, and update provenance.

        A RobustController is created for ``self.articleset`` and stored on
        ``self.controller``. The scraped articles are passed to
        ``postprocess``, after which a fresh provenance line is put in front of
        the article set's existing provenance and the article set is saved.

        @param _dummy: unused
        @return: the list of articles produced by the controller
        @raise Exception: if the controller produced no articles
        """
        upload = self.options['file']
        # Pass only what the message needs rather than the whole locals() dict.
        log.info(u"Importing {self.__class__.__name__} from {file.name} into {self.project}"
                 .format(self=self, file=upload))
        # Function-level import kept from the original
        # (presumably avoids a circular import -- TODO confirm).
        from amcat.scraping.controller import RobustController
        self.controller = RobustController(self.articleset)

        arts = list(self.controller.scrape(self))
        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # New entry first, then whatever provenance was already recorded.
        previous = self.articleset.provenance
        provenance_lines = [self.get_provenance(upload, arts)]
        if previous is not None:
            provenance_lines.append(previous)
        self.articleset.provenance = "\n".join(provenance_lines)
        self.articleset.save()

        return arts
示例#4
0
 def run(self, input=None, deduplicate=False):
     """Scrape this scraper into its article set using a RobustController.

     @param input: unused
     @param deduplicate: passed on as the second positional argument of
                         RobustController.scrape
     @return: the result of RobustController.scrape
     """
     message = (
         "Scraping {self.__class__.__name__} into {self.project}, medium {self.medium} using RobustController"
         .format(self=self))
     log.info(message)
     from amcat.scraping.controller import RobustController
     controller = RobustController(self.articleset)
     # The controller is given a one-element list containing this scraper.
     return controller.scrape([self], deduplicate)
示例#5
0
 def run(self, _input=None):
     """Build the scraper named by the 'scraper' option and scrape it.

     The scraper is obtained from ``self.options["scraper"].get_scraper``
     with the 'date' option as its ``date`` argument, then handed to a new
     RobustController.

     @param _input: unused
     """
     scraper_option = self.options["scraper"]
     day_scraper = scraper_option.get_scraper(date=self.options["date"])
     RobustController().scrape(day_scraper)
示例#6
0
class UploadScript(Scraper):
    """Base class for Upload Scripts, which are scraper scripts driven by
    the script input.

    For legacy reasons, parse_document and split_file may be used instead of the standard
    get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def get_errors(self):
        """
        Yield an end-user explanation for each error recorded by the controller.

        If ``self.controller`` (set by run) or its ``errors`` attribute is missing,
        the problem is logged and nothing is yielded.
        """
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return

        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """
        Explain the error in the context of unit for the end user

        @param error: object with ``i`` and ``error`` attributes
        @return: a message string
        """
        return "Error in element {error.i} : {error.error!r}".format(error=error)

    def decode(self, bytes):
        """
        Decode the bytes using the encoding from the form

        @return: the decoded text (the encoding reported by the form is discarded)
        """
        _enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            # First access: fetch from the form and remember the result
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        """
        Build the provenance line describing this upload.

        @param file: the uploaded file; only its ``name`` is used
        @param articles: the imported articles; only their count is used
        @return: a one-line description with timestamp, count, file name and script class
        """
        n = len(articles)
        filename = file.name
        # First 16 characters of the timestamp, i.e. up to and including the minutes
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}".format(
                    timestamp=timestamp, n=n, filename=filename, self=self))

    def run(self, _dummy=None):
        """
        Import the uploaded file into the article set and record provenance.

        @param _dummy: unused
        @return: the list of articles produced by the controller
        @raise Exception: if the controller produced no articles
        """
        upload = self.options['file']
        log.info(u"Importing {self.__class__.__name__} from {file.name} into {self.project}"
                 .format(self=self, file=upload))
        # Function-level import kept from the original
        # (presumably avoids a circular import -- TODO confirm).
        from amcat.scraping.controller import RobustController
        self.controller = RobustController(self.articleset)

        arts = list(self.controller.scrape(self))
        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # The newest provenance entry goes first, followed by any existing history.
        previous = self.articleset.provenance
        provenance_lines = [self.get_provenance(upload, arts)]
        if previous is not None:
            provenance_lines.append(previous)
        self.articleset.provenance = "\n".join(provenance_lines)
        self.articleset.save()

        return arts

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list will exclude them
        from the article set (if needed, list should be changed in place)
        """
        pass

    def _get_units(self):
        """
        Upload form assumes that the form (!) has a get_entries method, which you get
        if you subclass your form from one of the fileupload forms. If not, please override
        this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        """
        Yield the article(s) that parse_document produces for the given document.

        @param document: a fragment as yielded by _get_units
        """
        result = self.parse_document(document)
        if result is None:
            # parse_document may return None (see its docstring); iterating over
            # None would raise a TypeError, so treat it as "no articles".
            return
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy purposes

        @param document: object received from split_file, e.g. a string fragment
        @return: None, an Article or a sequence of Article(s)
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual documents.
        Default implementation returns a single-element list containing the file itself.

        @param file: an entry from the form's get_entries(), as passed by _get_units
        @return: a sequence of objects (e.g. strings) to pass to parse_document
        """
        return [file]
示例#7
0
class UploadScript(Scraper):
    """Base class for Upload Scripts, which are scraper scripts driven by
    the script input.

    For legacy reasons, parse_document and split_file may be used instead of the standard
    get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def get_errors(self):
        """
        Yield a user-readable explanation for every error the controller recorded.

        When ``self.controller`` (assigned in run) or its ``errors`` attribute does
        not exist, this is logged and the generator ends without yielding.
        """
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return

        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """
        Explain the error in the context of unit for the end user

        @param error: object exposing ``i`` and ``error`` attributes
        @return: a message string
        """
        return "Error in element {error.i} : {error.error!r}".format(
            error=error)

    def decode(self, bytes):
        """
        Decode the bytes using the encoding from the form

        @return: the decoded text; the encoding returned by the form is not used
        """
        _enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            # Not cached yet: ask the form once and keep the answer
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        """
        Compose the provenance line for this upload.

        @param file: the uploaded file; only its ``name`` is used
        @param articles: the imported articles; only their count is used
        @return: a one-line description with timestamp, count, file name and script class
        """
        n = len(articles)
        filename = file.name
        # Keep the first 16 characters of the timestamp (through the minutes)
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}".format(
                    timestamp=timestamp, n=n, filename=filename, self=self))

    def run(self, _dummy=None):
        """
        Scrape the uploaded file, postprocess the articles and update provenance.

        @param _dummy: unused
        @return: the list of articles produced by the controller
        @raise Exception: if the controller produced no articles
        """
        upload = self.options['file']
        log.info(
            u"Importing {self.__class__.__name__} from {file.name} into {self.project}"
            .format(self=self, file=upload))
        # Function-level import kept from the original
        # (presumably avoids a circular import -- TODO confirm).
        from amcat.scraping.controller import RobustController
        self.controller = RobustController(self.articleset)

        arts = list(self.controller.scrape(self))
        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # New entry first, then whatever provenance was already recorded.
        previous = self.articleset.provenance
        provenance_lines = [self.get_provenance(upload, arts)]
        if previous is not None:
            provenance_lines.append(previous)
        self.articleset.provenance = "\n".join(provenance_lines)
        self.articleset.save()

        return arts

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list will exclude them
        from the article set (if needed, list should be changed in place)
        """
        pass

    def _get_units(self):
        """
        Upload form assumes that the form (!) has a get_entries method, which you get
        if you subclass your form from one of the fileupload forms. If not, please override
        this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        """
        Yield each article that parse_document returns for the given document.

        @param document: a fragment as yielded by _get_units
        """
        result = self.parse_document(document)
        if result is None:
            # None is a documented return value of parse_document; without this
            # guard the loop below would raise a TypeError.
            return
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy purposes

        @param document: object received from split_file, e.g. a string fragment
        @return: None, an Article or a sequence of Article(s)
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual documents.
        Default implementation returns a single-element list containing the file itself.

        @param file: an entry from the form's get_entries(), as passed by _get_units
        @return: a sequence of objects (e.g. strings) to pass to parse_document
        """
        return [file]
示例#8
0
 def run(self, _input=None):
     """Scrape with the scraper that the 'scraper' option yields for the 'date' option.

     Calls ``get_scraper(date=...)`` on the 'scraper' option and passes the
     resulting scraper to a freshly created RobustController.

     @param _input: unused
     """
     selected = self.options["scraper"]
     dated_scraper = selected.get_scraper(date=self.options["date"])
     RobustController().scrape(dated_scraper)