Exemplo n.º 1
0
def load_data():
    """Read the command line and configuration file, then start the loaders.

    The part of the function shown here works in four steps:

    1. parse the command line (parse_options) and the configuration file
       (parse_config);
    2. resolve pgloader.options.REFORMAT_PATH, falling back to
       pgloader.options.DEFAULT_REFORMAT_PATH when the configured path
       yields nothing usable;
    3. build the list of sections to load: the single stdin section when
       LOAD_FROM_STDIN is set, the sections or file names given as
       arguments, or every configuration section except 'pgsql' when no
       argument is given;
    4. create one PGLoader per section and start it, passing all of them
       the same BoundedSemaphore sized from MAX_PARALLEL_SECTIONS.

    Invalid arguments are reported on stderr and end the process through
    sys.exit() with status 5 (stdin or boundaries misuse) or 2 (argument
    that is neither a section nor an existing file).
    """

    # first parse command line options, and set pgloader.options values
    # accordingly
    conffile, args = parse_options()

    # now init db connection
    config = parse_config(conffile)

    # imports are done here rather than at module level; presumably
    # because parse_options() must fill pgloader.options first -- TODO confirm
    from pgloader.logger  import log
    from pgloader.tools   import read_path, check_path
    from pgloader.options import VERBOSE
    
    import pgloader.options
    if pgloader.options.REFORMAT_PATH:
        # rpath is the path as read (check = False), crpath the result of
        # check_path() applied to it
        rpath  = read_path(pgloader.options.REFORMAT_PATH, log, check = False)
        crpath = check_path(rpath, log)
    else:
        rpath  = crpath  = None

    if not crpath:
        # nothing usable came out of the configured path: use the defaults
        if rpath:
            # don't check same path entries twice
            # NOTE(review): crpath is falsy in this branch, so set(crpath)
            # is either empty or fails if crpath is None; the operands of
            # the subtraction look reversed with respect to the comment
            # above -- confirm the intent before relying on this branch.
        
            default_rpath = set(crpath) \
                            - set(pgloader.options.DEFAULT_REFORMAT_PATH)
        else:
            default_rpath = pgloader.options.DEFAULT_REFORMAT_PATH
        
        pgloader.options.REFORMAT_PATH = check_path(default_rpath, log)
    else:
        # note that the unchecked rpath is stored here, not crpath
        pgloader.options.REFORMAT_PATH = rpath

    log.info('Reformat path is %s', pgloader.options.REFORMAT_PATH)

    # load some pgloader package modules
    # (VERBOSE was already imported above; several of these names are not
    # used in the part of the function shown here)
    from pgloader.options  import VERBOSE, DEBUG, QUIET, SUMMARY
    from pgloader.options  import DRY_RUN, PEDANTIC, VACUUM
    from pgloader.options  import MAX_PARALLEL_SECTIONS
    from pgloader.options  import LOAD_FROM_STDIN, LOAD_TO_TABLE
    from pgloader.options  import FILE_BOUNDARIES
    from pgloader.pgloader import PGLoader
    from pgloader.tools    import PGLoader_Error

    # sections: names of the sections to load, in loading order
    # summary:  section name -> list handed to that section's PGLoader
    sections = []
    summary  = {}

    # args are meant to be configuration sections, or filenames, or stdin
    if LOAD_FROM_STDIN:
        if FILE_BOUNDARIES is not None:
            # only a warning: processing goes on
            log.warning("Can't use --boundaries on stdin")

        if len(args) == 0:
            # no section given: create a '<stdin>' section targeting
            # LOAD_TO_TABLE, with csv format and all columns
            s = '<stdin>'
            config.add_section(s)
            config.set(s, 'table', LOAD_TO_TABLE)
            # the filename is the literal string 'sys.stdin'; presumably
            # the loader recognizes it as a marker -- TODO confirm
            config.set(s, 'filename', 'sys.stdin')
            config.set(s, 'columns', '*')
            config.set(s, 'format', 'csv')
            sections.append(s)
            
        elif len(args) == 1:
            if config.has_section(args[0]):
                # apply given section parameters, then load from stdin
                config.set(args[0], 'filename', 'sys.stdin')
                sections.append(args[0])
            else:
                print >>sys.stderr, \
                      "Error: Please provide a [%s] section" % args[0]
                sys.exit(5)
        else:
            print >>sys.stderr, \
                  "Error: can't read several sections all from stdin"
            sys.exit(5)

    elif len(args) > 0:
        # each argument is either an existing section name or a file name
        for s in args:
            if config.has_section(s):
                sections.append(s)
            else:
                log.info("Creating a section for file '%s'" % s)
                # a filename was given, apply [pgsql] defaults
                # set the tablename as the filename sans extension
                # consider columns = *
                if not os.path.exists(s):
                    print >>sys.stderr, \
                        "Error: '%s' does not exists as a section nor as a file" % s
                    sys.exit(2)

                # the section is named after the file path itself
                config.add_section(s)
                config.set(s, 'table', os.path.splitext(os.path.basename(s))[0])
                config.set(s, 'filename', s)
                config.set(s, 'columns', '*')
                config.set(s, 'format', 'csv')
                sections.append(s)

    else:
        # this branch is only reached when LOAD_FROM_STDIN is false (see the
        # first `if` of the chain), so the test below always succeeds here
        if not LOAD_FROM_STDIN:
            # don't load all sections first when asked to load stdin
            log.debug("No argument on CLI, will consider all sections")
            for s in config.sections():
                if s != 'pgsql':
                    sections.append(s)

            # we run through sorted section list, unless we got the section list
            # from command line
            sections.sort()

    if FILE_BOUNDARIES is not None and len(sections) > 1:
        print >>sys.stderr, \
              "Error: will not apply boundaries on more than one file"
        sys.exit(5)

    log.info('Will consider following sections:')
    # myprint is not defined in this function; each item it yields is
    # logged on its own line
    for line in myprint(sections):
        log.info(line)

    # we count time passed from now on
    begin = time.time()

    # threads:  section name -> started PGLoader
    # started / finished: section name -> threading.Event given to the loader
    # current:  index in `sections` of the section being set up
    threads  = {}
    started  = {}
    finished = {}
    current  = 0

    # -1 means no limit: allow as many loaders as there are sections
    max_running = MAX_PARALLEL_SECTIONS
    if max_running == -1:
        max_running = len(sections)

    log.info('Will load %d section at a time' % max_running)

    sem = threading.BoundedSemaphore(max_running)
    
    # NOTE(review): `current` is not incremented in the part of the loop
    # shown here; the increment is presumably further down -- confirm.
    while current < len(sections):
        s = sections[current]

        try:
            loader      = None
            summary [s] = []
            started [s] = threading.Event()
            finished[s] = threading.Event()

            try:
                loader = PGLoader(s, config, sem,
                                  (started[s], finished[s]), summary[s])
            except PGLoader_Error, e:
                # could not initialize properly this loader, don't
                # ever wait for it
                started[s] .set()
                finished[s].set()
                log.error(e)
                if DEBUG:
                    raise

            except IOError, e:
                # No space left on device?  can't log it
                # leave the loop without setting up the remaining sections
                break

            # loader is still None when PGLoader() raised PGLoader_Error
            if loader:
                if not loader.template:
                    # boundaries only apply when a single section is loaded
                    if FILE_BOUNDARIES is not None and len(sections) == 1:
                        loader.reader.set_boundaries(FILE_BOUNDARIES)
                    # kept in locals; not used again in the part shown here
                    filename       = loader.filename
                    input_encoding = loader.input_encoding
                    threads[s]     = loader

                    # .start() will sem.acquire(), so we won't have more
                    # than max_running threads running at any time.
                    log.debug("Starting a thread for %s" % s)
                    threads[s].start()
                else:
                    log.info("Skipping section %s, which is a template" % s)

                    # forget the bookkeeping entries created above for
                    # this section
                    for d in (summary, started, finished):
                        d.pop(s)
Exemplo n.º 2
0
def load_data():
    """Read the command line and configuration file, then start the loaders.

    The part of the function shown here works in four steps:

    1. parse the command line (parse_options) and the configuration file
       (parse_config);
    2. resolve pgloader.options.REFORMAT_PATH, falling back to
       pgloader.options.DEFAULT_REFORMAT_PATH when the configured path
       yields nothing usable;
    3. build the list of sections to load: the single stdin section when
       LOAD_FROM_STDIN is set, the sections or file names given as
       arguments, or every configuration section except 'pgsql' when no
       argument is given;
    4. create one PGLoader per section and start it, passing all of them
       the same BoundedSemaphore sized from MAX_PARALLEL_SECTIONS.

    Invalid arguments are reported on stderr and end the process through
    sys.exit() with status 5 (stdin or boundaries misuse) or 2 (argument
    that is neither a section nor an existing file).
    """

    # first parse command line options, and set pgloader.options values
    # accordingly
    conffile, args = parse_options()

    # now init db connection
    config = parse_config(conffile)

    # imports are done here rather than at module level; presumably
    # because parse_options() must fill pgloader.options first -- TODO confirm
    from pgloader.logger import log
    from pgloader.tools import read_path, check_path
    from pgloader.options import VERBOSE

    import pgloader.options
    if pgloader.options.REFORMAT_PATH:
        # rpath is the path as read (check=False), crpath the result of
        # check_path() applied to it
        rpath = read_path(pgloader.options.REFORMAT_PATH, log, check=False)
        crpath = check_path(rpath, log)
    else:
        rpath = crpath = None

    if not crpath:
        # nothing usable came out of the configured path: use the defaults
        if rpath:
            # don't check same path entries twice
            # NOTE(review): crpath is falsy in this branch, so set(crpath)
            # is either empty or fails if crpath is None; the operands of
            # the subtraction look reversed with respect to the comment
            # above -- confirm the intent before relying on this branch.

            default_rpath = set(crpath) \
                            - set(pgloader.options.DEFAULT_REFORMAT_PATH)
        else:
            default_rpath = pgloader.options.DEFAULT_REFORMAT_PATH

        pgloader.options.REFORMAT_PATH = check_path(default_rpath, log)
    else:
        # note that the unchecked rpath is stored here, not crpath
        pgloader.options.REFORMAT_PATH = rpath

    log.info('Reformat path is %s', pgloader.options.REFORMAT_PATH)

    # load some pgloader package modules
    # (VERBOSE was already imported above; several of these names are not
    # used in the part of the function shown here)
    from pgloader.options import VERBOSE, DEBUG, QUIET, SUMMARY
    from pgloader.options import DRY_RUN, PEDANTIC, VACUUM
    from pgloader.options import MAX_PARALLEL_SECTIONS
    from pgloader.options import LOAD_FROM_STDIN, LOAD_TO_TABLE
    from pgloader.options import FILE_BOUNDARIES
    from pgloader.pgloader import PGLoader
    from pgloader.tools import PGLoader_Error

    # sections: names of the sections to load, in loading order
    # summary:  section name -> list handed to that section's PGLoader
    sections = []
    summary = {}

    # args are meant to be configuration sections, or filenames, or stdin
    if LOAD_FROM_STDIN:
        if FILE_BOUNDARIES is not None:
            # only a warning: processing goes on
            log.warning("Can't use --boundaries on stdin")

        if len(args) == 0:
            # no section given: create a '<stdin>' section targeting
            # LOAD_TO_TABLE, with csv format and all columns
            s = '<stdin>'
            config.add_section(s)
            config.set(s, 'table', LOAD_TO_TABLE)
            # the filename is the literal string 'sys.stdin'; presumably
            # the loader recognizes it as a marker -- TODO confirm
            config.set(s, 'filename', 'sys.stdin')
            config.set(s, 'columns', '*')
            config.set(s, 'format', 'csv')
            sections.append(s)

        elif len(args) == 1:
            if config.has_section(args[0]):
                # apply given section parameters, then load from stdin
                config.set(args[0], 'filename', 'sys.stdin')
                sections.append(args[0])
            else:
                print >>sys.stderr, \
                      "Error: Please provide a [%s] section" % args[0]
                sys.exit(5)
        else:
            print >>sys.stderr, \
                  "Error: can't read several sections all from stdin"
            sys.exit(5)

    elif len(args) > 0:
        # each argument is either an existing section name or a file name
        for s in args:
            if config.has_section(s):
                sections.append(s)
            else:
                log.info("Creating a section for file '%s'" % s)
                # a filename was given, apply [pgsql] defaults
                # set the tablename as the filename sans extension
                # consider columns = *
                if not os.path.exists(s):
                    print >>sys.stderr, \
                        "Error: '%s' does not exists as a section nor as a file" % s
                    sys.exit(2)

                # the section is named after the file path itself
                config.add_section(s)
                config.set(s, 'table',
                           os.path.splitext(os.path.basename(s))[0])
                config.set(s, 'filename', s)
                config.set(s, 'columns', '*')
                config.set(s, 'format', 'csv')
                sections.append(s)

    else:
        # this branch is only reached when LOAD_FROM_STDIN is false (see the
        # first `if` of the chain), so the test below always succeeds here
        if not LOAD_FROM_STDIN:
            # don't load all sections first when asked to load stdin
            log.debug("No argument on CLI, will consider all sections")
            for s in config.sections():
                if s != 'pgsql':
                    sections.append(s)

            # we run through sorted section list, unless we got the section list
            # from command line
            sections.sort()

    if FILE_BOUNDARIES is not None and len(sections) > 1:
        print >>sys.stderr, \
              "Error: will not apply boundaries on more than one file"
        sys.exit(5)

    log.info('Will consider following sections:')
    # myprint is not defined in this function; each item it yields is
    # logged on its own line
    for line in myprint(sections):
        log.info(line)

    # we count time passed from now on
    begin = time.time()

    # threads:  section name -> started PGLoader
    # started / finished: section name -> threading.Event given to the loader
    # current:  index in `sections` of the section being set up
    threads = {}
    started = {}
    finished = {}
    current = 0

    # -1 means no limit: allow as many loaders as there are sections
    max_running = MAX_PARALLEL_SECTIONS
    if max_running == -1:
        max_running = len(sections)

    log.info('Will load %d section at a time' % max_running)

    sem = threading.BoundedSemaphore(max_running)

    # NOTE(review): `current` is not incremented in the part of the loop
    # shown here; the increment is presumably further down -- confirm.
    while current < len(sections):
        s = sections[current]

        try:
            loader = None
            summary[s] = []
            started[s] = threading.Event()
            finished[s] = threading.Event()

            try:
                loader = PGLoader(s, config, sem, (started[s], finished[s]),
                                  summary[s])
            except PGLoader_Error, e:
                # could not initialize properly this loader, don't
                # ever wait for it
                started[s].set()
                finished[s].set()
                log.error(e)
                if DEBUG:
                    raise

            except IOError, e:
                # No space left on device?  can't log it
                # leave the loop without setting up the remaining sections
                break

            # loader is still None when PGLoader() raised PGLoader_Error
            if loader:
                if not loader.template:
                    # boundaries only apply when a single section is loaded
                    if FILE_BOUNDARIES is not None and len(sections) == 1:
                        loader.reader.set_boundaries(FILE_BOUNDARIES)
                    # kept in locals; not used again in the part shown here
                    filename = loader.filename
                    input_encoding = loader.input_encoding
                    threads[s] = loader

                    # .start() will sem.acquire(), so we won't have more
                    # than max_running threads running at any time.
                    log.debug("Starting a thread for %s" % s)
                    threads[s].start()
                else:
                    log.info("Skipping section %s, which is a template" % s)

                    # forget the bookkeeping entries created above for
                    # this section
                    for d in (summary, started, finished):
                        d.pop(s)
Exemplo n.º 3
0
                    input_encoding = loader.input_encoding
                    threads[s]     = loader

                    # .start() will sem.aquire(), so we won't have more
                    # than max_running threads running at any time.
                    log.debug("Starting a thread for %s" % s)
                    threads[s].start()
                else:
                    log.info("Skipping section %s, which is a template" % s)

                    for d in (summary, started, finished):
                        d.pop(s)

        except PGLoader_Error, e:
            if e == '':
                log.error('[%s] Please correct previous errors' % s)
            else:
                log.error('%s' % e)

            if DEBUG:
                raise

            if PEDANTIC:
                # was: threads[s].print_stats()
                # but now thread[s] is no more alive
                pass

        except UnicodeDecodeError, e:
            log.error("can't open '%s' with given input encoding '%s'" \
                               % (filename, input_encoding))
                                    
Exemplo n.º 4
0
                    input_encoding = loader.input_encoding
                    threads[s] = loader

                    # .start() will sem.aquire(), so we won't have more
                    # than max_running threads running at any time.
                    log.debug("Starting a thread for %s" % s)
                    threads[s].start()
                else:
                    log.info("Skipping section %s, which is a template" % s)

                    for d in (summary, started, finished):
                        d.pop(s)

        except PGLoader_Error, e:
            if e == '':
                log.error('[%s] Please correct previous errors' % s)
            else:
                log.error('%s' % e)

            if DEBUG:
                raise

            if PEDANTIC:
                # was: threads[s].print_stats()
                # but now thread[s] is no more alive
                pass

        except UnicodeDecodeError, e:
            log.error("can't open '%s' with given input encoding '%s'" \
                               % (filename, input_encoding))