예제 #1
0
def main(argv=None):
    """Parse command-line options, then run the scraper or initialize its database.

    Follows Guido van Rossum's pattern for a Python main function.

    Recognized options:
        -h, --help        print the module docstring and exit with status 0
        -i, --init        call init_roots() instead of scraping
        -b, --debug       set Settings.DEBUG to True (it is False otherwise)
        -r, --reparse     passed to scrape_articles as reparse=True
        -l, --limit N     maximum number of articles to parse (default 10)
        -u, --urls FILE   text file with a list of URLs
        -d, --uuid UUID   UUID of a single article to reparse
        -n, --numprocs N  maximum number of processes to fork when parsing

    Args:
        argv: Full argument vector including the program name in position 0.
            Defaults to sys.argv when None.

    Returns:
        0 on completion, 2 on a usage error or a configuration error.

    Raises:
        SystemExit: With status 0 when help is requested.
    """

    if argv is None:
        argv = sys.argv
    try:
        try:
            # Positional arguments are not used, so they are discarded
            opts, _ = getopt.getopt(
                argv[1:],
                "hirbl:u:d:n:",
                [
                    "help", "init", "reparse", "debug", "limit=", "urls=",
                    "uuid=", "numprocs="
                ],
            )
        except getopt.error as msg:
            # Convert to Usage so the handler at the bottom reports it
            raise Usage(msg)
        init = False
        # !!! DEBUG default limit on number of articles to parse, unless otherwise specified
        limit = 10
        reparse = False
        urls = None
        uuid = None
        numprocs = None
        debug = False

        def parse_int(text):
            """Return text converted to int, or None if it is not a valid integer."""
            try:
                # Convert the argument itself; the previous version ignored
                # its parameter and read the enclosing loop variable instead
                return int(text)
            except ValueError:
                return None

        # Process options
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            elif o in ("-i", "--init"):
                # Initialize database (without overwriting existing data)
                init = True
            elif o in ("-b", "--debug"):
                # Run in debug mode
                debug = True
            elif o in ("-r", "--reparse"):
                # Reparse already parsed articles, oldest first
                reparse = True
            elif o in ("-l", "--limit"):
                # Maximum number of articles to parse; a non-numeric value
                # yields None, which is passed on to scrape_articles as-is
                limit = parse_int(a)
            elif o in ("-u", "--urls"):
                # Text file with list of URLs
                urls = a
            elif o in ("-d", "--uuid"):
                # UUID of a single article to reparse
                uuid = a
            elif o in ("-n", "--numprocs"):
                # Max number of processes to fork when parsing
                # (default: use all CPU cores)
                numprocs = parse_int(a)

        # Set logging format
        logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                            level=logging.INFO)

        # Read the configuration settings file
        try:
            Settings.read("config/Greynir.conf")
            # Don't run the scraper in debug mode unless --debug is specified
            Settings.DEBUG = debug
        except ConfigError as e:
            print("Configuration error: {0}".format(e), file=sys.stderr)
            return 2

        if init:
            # Initialize the scraper database
            init_roots()
        else:
            # Run the scraper
            scrape_articles(reparse=reparse,
                            limit=limit,
                            urls=urls,
                            uuid=uuid,
                            numprocs=numprocs)

    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    finally:
        # Runs on every exit path, including the early returns and sys.exit above
        SessionContext.cleanup()
        Article.cleanup()

    # Completed with no error
    return 0
예제 #2
0
def main(argv=None):
    """Command-line entry point (Guido van Rossum's main() pattern).

    Parses the options in argv (sys.argv when None), reads
    config/Greynir.conf with debug mode switched off, and then either
    initializes the scraper database (-i/--init) or runs the scraper.

    Returns 0 on completion and 2 on a usage or configuration error;
    -h/--help prints the module docstring and exits with status 0.
    """
    arguments = sys.argv if argv is None else argv

    try:
        try:
            # Positional arguments are accepted but not used
            parsed, _ = getopt.getopt(
                arguments[1:],
                "hirl:u:d:",
                ["help", "init", "reparse", "limit=", "urls=", "uuid="],
            )
        except getopt.error as exc:
            raise Usage(exc)

        init = reparse = False
        urls = uuid = None
        # !!! DEBUG default limit on number of articles to parse, unless otherwise specified
        limit = 10

        # Walk the options in order; later occurrences override earlier ones
        for flag, value in parsed:
            if flag in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            if flag in ("-i", "--init"):
                init = True
                continue
            if flag in ("-r", "--reparse"):
                reparse = True
                continue
            if flag in ("-l", "--limit"):
                # Maximum number of articles to parse;
                # a non-numeric value leaves the current limit untouched
                try:
                    limit = int(value)
                except ValueError:
                    pass
                continue
            if flag in ("-u", "--urls"):
                # Text file with list of URLs
                urls = value
                continue
            if flag in ("-d", "--uuid"):
                # UUID of article to reparse
                uuid = value

        # Set logging format
        logging.basicConfig(
            format="%(asctime)s %(levelname)s:%(message)s",
            level=logging.INFO,
        )

        # Load the configuration; the scraper never runs in debug mode
        try:
            Settings.read("config/Greynir.conf")
            Settings.DEBUG = False
        except ConfigError as e:
            print("Configuration error: {0}".format(e), file=sys.stderr)
            return 2

        if not init:
            # Run the scraper
            scrape_articles(reparse=reparse, limit=limit, urls=urls, uuid=uuid)
        else:
            # Initialize the scraper database
            init_roots()

    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    finally:
        # Cleanup happens on every exit path out of the try block
        SessionContext.cleanup()
        Article.cleanup()

    # Completed with no error
    return 0
예제 #3
0
def main(argv=None):
    """Command-line entry point (Guido van Rossum's main() pattern).

    Parses the options in argv (sys.argv when None), reads
    config/Reynir.conf with debug mode switched off, and then either
    initializes the scraper database (-i/--init) or runs the scraper.

    Returns 0 on completion and 2 on a usage or configuration error;
    -h/--help prints the module docstring and exits with status 0.
    """
    arguments = sys.argv if argv is None else argv

    try:
        try:
            # Positional arguments are accepted but not used
            parsed, _ = getopt.getopt(
                arguments[1:],
                "hirl:u:",
                ["help", "init", "reparse", "limit=", "urls="],
            )
        except getopt.error as exc:
            raise Usage(exc)

        init = reparse = False
        urls = None
        # !!! DEBUG default limit on number of articles to parse, unless otherwise specified
        limit = 10

        # Walk the options in order; later occurrences override earlier ones
        for flag, value in parsed:
            if flag in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            if flag in ("-i", "--init"):
                init = True
                continue
            if flag in ("-r", "--reparse"):
                reparse = True
                continue
            if flag in ("-l", "--limit"):
                # Maximum number of articles to parse;
                # a non-numeric value leaves the current limit untouched
                try:
                    limit = int(value)
                except ValueError:
                    pass
                continue
            if flag in ("-u", "--urls"):
                # Text file with list of URLs
                urls = value

        # Set logging format
        logging.basicConfig(
            format="%(asctime)s %(levelname)s:%(message)s",
            level=logging.INFO,
        )

        # Load the configuration; the scraper never runs in debug mode
        try:
            Settings.read("config/Reynir.conf")
            Settings.DEBUG = False
        except ConfigError as e:
            print("Configuration error: {0}".format(e), file=sys.stderr)
            return 2

        if not init:
            # Run the scraper
            scrape_articles(reparse=reparse, limit=limit, urls=urls)
        else:
            # Initialize the scraper database
            init_roots()

    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    finally:
        # Cleanup happens on every exit path out of the try block
        SessionContext.cleanup()
        Article.cleanup()

    # Completed with no error
    return 0