예제 #1
0
def __zookeeper_path(dist_directory: str = MC_DIST_DIR,
                     zookeeper_version: str = MC_ZOOKEEPER_VERSION) -> str:
    """Build the path at which the ZooKeeper distribution is expected.

    The distribution directory is resolved with
    resolve_absolute_path_under_mc_root() and a versioned
    "zookeeper-<version>" directory name is appended to it.
    """
    return os.path.join(
        resolve_absolute_path_under_mc_root(path=dist_directory),
        "zookeeper-%s" % zookeeper_version,
    )
예제 #2
0
def run_solr_standalone(
    hostname: str = None,
    port: int = MC_SOLR_STANDALONE_PORT,
    base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
    dist_directory: str = MC_DIST_DIR,
    solr_version: str = MC_SOLR_VERSION,
    jvm_heap_size: str = MC_SOLR_STANDALONE_JVM_HEAP_SIZE,
):
    """Run standalone instance of Solr, installing Solr first if it is missing.

    :param hostname: hostname handed to __run_solr(); None (the default) means
        "use fqdn()", looked up when the function is called.
    :param port: TCP port for the instance; must not be open yet.
    :param base_data_dir: base data directory; resolved with
        resolve_absolute_path_under_mc_root() and required to exist.
    :param dist_directory: distribution directory passed to the install / run helpers.
    :param solr_version: Solr version passed to the install / run helpers.
    :param jvm_heap_size: JVM heap size passed to __run_solr().
    :raises Exception: if `port` is already open on this machine.
    """
    # Resolve the default lazily: a call placed in the signature would be
    # evaluated once, at import time, even if this function is never called.
    if hostname is None:
        hostname = fqdn()

    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)
    standalone_data_dir = __standalone_data_dir(base_data_dir=base_data_dir)

    # Refuse to start when something is already listening on the target port
    if tcp_port_is_open(port=port):
        raise Exception("Port %d is already open on this machine." % port)

    l.info("Starting standalone Solr instance on port %d..." % port)
    __run_solr(
        hostname=hostname,
        port=port,
        instance_data_dir=standalone_data_dir,
        jvm_heap_size=jvm_heap_size,
        jvm_opts=MC_SOLR_STANDALONE_JVM_OPTS,
        connect_timeout=MC_SOLR_STANDALONE_CONNECT_RETRIES,
        dist_directory=dist_directory,
        solr_version=solr_version,
    )
예제 #3
0
파일: solr.py 프로젝트: oakieoak/mediacloud
def run_solr_standalone(hostname: str = None,
                        port: int = MC_SOLR_STANDALONE_PORT,
                        base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                        dist_directory: str = MC_DIST_DIR,
                        solr_version: str = MC_SOLR_VERSION,
                        jvm_heap_size: str = MC_SOLR_STANDALONE_JVM_HEAP_SIZE):
    """Run standalone instance of Solr, installing Solr first if it is missing.

    :param hostname: hostname handed to __run_solr(); None (the default) means
        "use fqdn()", looked up when the function is called.
    :param port: TCP port for the instance; must not be open yet.
    :param base_data_dir: base data directory; resolved with
        resolve_absolute_path_under_mc_root() and required to exist.
    :param dist_directory: distribution directory passed to the install / run helpers.
    :param solr_version: Solr version passed to the install / run helpers.
    :param jvm_heap_size: JVM heap size passed to __run_solr().
    :raises McSolrRunException: if `port` is already open on this machine.
    """
    # Resolve the default lazily: a call placed in the signature would be
    # evaluated once, at import time, even if this function is never called.
    if hostname is None:
        hostname = fqdn()

    if not __solr_is_installed(dist_directory=dist_directory,
                               solr_version=solr_version):
        l.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory,
                       solr_version=solr_version)

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir,
                                                        must_exist=True)
    standalone_data_dir = __standalone_data_dir(base_data_dir=base_data_dir)

    # Refuse to start when something is already listening on the target port
    if tcp_port_is_open(port=port):
        raise McSolrRunException("Port %d is already open on this machine." %
                                 port)

    l.info("Starting standalone Solr instance on port %d..." % port)
    __run_solr(hostname=hostname,
               port=port,
               instance_data_dir=standalone_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_STANDALONE_JVM_OPTS,
               connect_timeout=MC_SOLR_STANDALONE_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
예제 #4
0
def test_resolve_absolute_path_under_mc_root():
    # The root itself ("."), which has to exist
    root_path = mc_paths.resolve_absolute_path_under_mc_root(path='.', must_exist=True)
    assert len(root_path) > 0

    # A file that is expected to be present under the root
    existing_path = mc_paths.resolve_absolute_path_under_mc_root(path='mediawords.yml', must_exist=True)
    assert len(existing_path) > 0
    assert os.path.isfile(existing_path)

    # A name that is not present; resolving it is still allowed with must_exist=False
    missing_path = mc_paths.resolve_absolute_path_under_mc_root(path='TOTALLY_DOES_NOT_EXIST', must_exist=False)
    assert len(missing_path) > 0
    assert not os.path.isfile(missing_path)
예제 #5
0
def __zookeeper_path(dist_directory: str = MC_DIST_DIR,
                     zookeeper_version: str = MC_ZOOKEEPER_VERSION) -> str:
    """Build the path at which the ZooKeeper distribution is expected.

    The distribution directory is resolved with
    resolve_absolute_path_under_mc_root() and a versioned
    "zookeeper-<version>" directory name is appended to it.
    """
    resolved_dist_dir = resolve_absolute_path_under_mc_root(path=dist_directory)
    versioned_dir_name = "zookeeper-%s" % zookeeper_version
    return os.path.join(resolved_dist_dir, versioned_dir_name)
예제 #6
0
파일: solr.py 프로젝트: oakieoak/mediacloud
def __solr_path(dist_directory: str = MC_DIST_DIR,
                solr_version: str = MC_SOLR_VERSION) -> str:
    """Build the path at which the Solr distribution is expected.

    The distribution directory must already exist; a versioned
    "solr-<version>" directory name is appended to its resolved path.
    """
    return os.path.join(
        resolve_absolute_path_under_mc_root(path=dist_directory, must_exist=True),
        "solr-%s" % solr_version,
    )
예제 #7
0
파일: solr.py 프로젝트: nzufelt/mediacloud
def run_solr_shard(shard_num: int,
                   shard_count: int,
                   hostname: str = None,
                   starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT,
                   base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                   dist_directory: str = MC_DIST_DIR,
                   solr_version: str = MC_SOLR_VERSION,
                   zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                   zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                   jvm_heap_size: str = MC_SOLR_CLUSTER_JVM_HEAP_SIZE) -> None:
    """Run Solr shard, install Solr if needed; read configuration from ZooKeeper.

    :param shard_num: number of the shard to run (1 or greater).
    :param shard_count: total number of shards (1 or greater); passed to Solr as -DnumShards.
    :param hostname: hostname handed to __run_solr(); fqdn() is used when None.
    :param starting_port: port from which the shard's own port is derived via __shard_port().
    :param base_data_dir: base data directory; resolved with
        resolve_absolute_path_under_mc_root() and required to exist.
    :param dist_directory: distribution directory passed to the install / run helpers.
    :param solr_version: Solr version passed to the install / run helpers.
    :param zookeeper_host: ZooKeeper host to wait for and to pass to Solr as -DzkHost.
    :param zookeeper_port: ZooKeeper port to wait for and to pass to Solr as -DzkHost.
    :param jvm_heap_size: JVM heap size passed to __run_solr().
    :raises McSolrRunException: if shard_num or shard_count is below 1, or if
        ZooKeeper's port did not open within the configured number of retries.
    """
    if shard_num < 1:
        raise McSolrRunException("Shard number must be 1 or greater.")
    if shard_count < 1:
        raise McSolrRunException("Shard count must be 1 or greater.")

    if not __solr_is_installed(dist_directory=dist_directory,
                               solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory,
                       solr_version=solr_version)

    if hostname is None:
        hostname = fqdn()

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir,
                                                        must_exist=True)

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)
    shard_data_dir = __shard_data_dir(shard_num=shard_num,
                                      base_data_dir=base_data_dir)

    log.info("Waiting for ZooKeeper to start on %s:%d..." %
             (zookeeper_host, zookeeper_port))
    # The wait helper signals failure through a falsy return value; without
    # this check "ZooKeeper is up!" would be logged even when it never came up.
    zookeeper_is_up = wait_for_tcp_port_to_open(
        hostname=zookeeper_host,
        port=zookeeper_port,
        retries=MC_SOLR_CLUSTER_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_is_up:
        raise McSolrRunException("Unable to connect to ZooKeeper at %s:%d" %
                                 (zookeeper_host, zookeeper_port))
    log.info("ZooKeeper is up!")

    log.info("Starting Solr shard %d on port %d..." % (shard_num, shard_port))
    # noinspection SpellCheckingInspection
    shard_args = [
        "-DzkHost=%s:%d" % (zookeeper_host, zookeeper_port),
        "-DnumShards=%d" % shard_count,
    ]
    __run_solr(hostname=hostname,
               port=shard_port,
               instance_data_dir=shard_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_CLUSTER_JVM_OPTS,
               start_jar_args=shard_args,
               connect_timeout=MC_SOLR_CLUSTER_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
예제 #8
0
def run_solr_shard(shard_num: int,
                   shard_count: int,
                   hostname: str = None,
                   starting_port: int = MC_SOLR_CLUSTER_STARTING_PORT,
                   base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                   dist_directory: str = MC_DIST_DIR,
                   solr_version: str = MC_SOLR_VERSION,
                   zookeeper_host: str = MC_SOLR_CLUSTER_ZOOKEEPER_HOST,
                   zookeeper_port: int = MC_SOLR_CLUSTER_ZOOKEEPER_PORT,
                   jvm_heap_size: str = MC_SOLR_CLUSTER_JVM_HEAP_SIZE) -> None:
    """Run Solr shard, install Solr if needed; read configuration from ZooKeeper.

    :param shard_num: number of the shard to run (1 or greater).
    :param shard_count: total number of shards (1 or greater); passed to Solr as -DnumShards.
    :param hostname: hostname handed to __run_solr(); fqdn() is used when None.
    :param starting_port: port from which the shard's own port is derived via __shard_port().
    :param base_data_dir: base data directory; resolved with
        resolve_absolute_path_under_mc_root() and required to exist.
    :param dist_directory: distribution directory passed to the install / run helpers.
    :param solr_version: Solr version passed to the install / run helpers.
    :param zookeeper_host: ZooKeeper host to wait for and to pass to Solr as -DzkHost.
    :param zookeeper_port: ZooKeeper port to wait for and to pass to Solr as -DzkHost.
    :param jvm_heap_size: JVM heap size passed to __run_solr().
    :raises McSolrRunException: if shard_num or shard_count is below 1, or if
        ZooKeeper's port did not open within the configured number of retries.
    """
    if shard_num < 1:
        raise McSolrRunException("Shard number must be 1 or greater.")
    if shard_count < 1:
        raise McSolrRunException("Shard count must be 1 or greater.")

    if not __solr_is_installed(dist_directory=dist_directory, solr_version=solr_version):
        log.info("Solr is not installed, installing...")
        __install_solr(dist_directory=dist_directory, solr_version=solr_version)

    if hostname is None:
        hostname = fqdn()

    base_data_dir = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    shard_port = __shard_port(shard_num=shard_num, starting_port=starting_port)
    shard_data_dir = __shard_data_dir(shard_num=shard_num, base_data_dir=base_data_dir)

    log.info("Waiting for ZooKeeper to start on %s:%d..." % (zookeeper_host, zookeeper_port))
    # The wait helper signals failure through a falsy return value; without
    # this check "ZooKeeper is up!" would be logged even when it never came up.
    zookeeper_is_up = wait_for_tcp_port_to_open(hostname=zookeeper_host,
                                                port=zookeeper_port,
                                                retries=MC_SOLR_CLUSTER_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_is_up:
        raise McSolrRunException("Unable to connect to ZooKeeper at %s:%d" % (zookeeper_host, zookeeper_port))
    log.info("ZooKeeper is up!")

    log.info("Starting Solr shard %d on port %d..." % (shard_num, shard_port))
    # noinspection SpellCheckingInspection
    shard_args = [
        "-DzkHost=%s:%d" % (zookeeper_host, zookeeper_port),
        "-DnumShards=%d" % shard_count,
    ]
    __run_solr(hostname=hostname,
               port=shard_port,
               instance_data_dir=shard_data_dir,
               jvm_heap_size=jvm_heap_size,
               jvm_opts=MC_SOLR_CLUSTER_JVM_OPTS,
               start_jar_args=shard_args,
               connect_timeout=MC_SOLR_CLUSTER_CONNECT_RETRIES,
               dist_directory=dist_directory,
               solr_version=solr_version)
예제 #9
0
def upgrade_lucene_standalone_index(base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                                    dist_directory: str = MC_DIST_DIR,
                                    solr_version: str = MC_SOLR_VERSION):
    """Run the Lucene index upgrade against the standalone instance's data.

    Raises McSolrRunException instead of upgrading while the standalone Solr
    port is open.
    """
    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Upgrading is only attempted when nothing listens on the standalone port
    log.info("Making sure standalone instance isn't running...")
    standalone_port = MC_SOLR_STANDALONE_PORT
    if tcp_port_is_open(port=standalone_port):
        raise McSolrRunException("Solr standalone instance is running on port %d." % standalone_port)
    log.info("Made sure standalone instance isn't running.")

    log.info("Upgrading standalone instance indexes...")
    __upgrade_lucene_index(
        instance_data_dir=__standalone_data_dir(base_data_dir=data_root),
        dist_directory=dist_directory,
        solr_version=solr_version,
    )
    log.info("Upgraded standalone instance indexes...")
예제 #10
0
def upgrade_lucene_standalone_index(
    base_data_dir: str = MC_SOLR_BASE_DATA_DIR, dist_directory: str = MC_DIST_DIR, solr_version: str = MC_SOLR_VERSION
):
    """Run the Lucene index upgrade against the standalone instance's data.

    Raises Exception instead of upgrading while the standalone Solr port is
    open.
    """
    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Upgrading is only attempted when nothing listens on the standalone port
    l.info("Making sure standalone instance isn't running...")
    standalone_port = MC_SOLR_STANDALONE_PORT
    if tcp_port_is_open(port=standalone_port):
        raise Exception("Solr standalone instance is running on port %d." % standalone_port)
    l.info("Made sure standalone instance isn't running.")

    l.info("Upgrading standalone instance indexes...")
    instance_dir = __standalone_data_dir(base_data_dir=data_root)
    __upgrade_lucene_index(instance_data_dir=instance_dir,
                           dist_directory=dist_directory,
                           solr_version=solr_version)
    l.info("Upgraded standalone instance indexes...")
예제 #11
0
파일: solr.py 프로젝트: oakieoak/mediacloud
def upgrade_lucene_shards_indexes(base_data_dir: str = MC_SOLR_BASE_DATA_DIR,
                                  dist_directory: str = MC_DIST_DIR,
                                  solr_version: str = MC_SOLR_VERSION):
    """Run the Lucene index upgrade against every shard's data directory.

    The shard count is inferred from the consecutive shard data directories
    (1, 2, ...) found on disk. Raises McSolrRunException when fewer than two
    are found or when any shard's port is open.
    """
    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Count shard directories upwards from 1 until the first missing one
    l.info("Looking for shards...")
    shard_count = 0
    while os.path.isdir(__shard_data_dir(shard_num=shard_count + 1, base_data_dir=data_root)):
        shard_count += 1
    if shard_count < 2:
        raise McSolrRunException("Found less than 2 shards.")
    l.info("Found %d shards." % shard_count)

    shard_numbers = range(1, shard_count + 1)

    # All ports are checked before any index is touched
    l.info("Making sure shards aren't running...")
    for shard_num in shard_numbers:
        port = __shard_port(shard_num=shard_num, starting_port=MC_SOLR_CLUSTER_STARTING_PORT)
        if tcp_port_is_open(port=port):
            raise McSolrRunException("Solr shard %d is running on port %d." % (shard_num, port))
    l.info("Made sure shards aren't running.")

    l.info("Upgrading shard indexes...")
    for shard_num in shard_numbers:
        __upgrade_lucene_index(
            instance_data_dir=__shard_data_dir(shard_num=shard_num, base_data_dir=data_root),
            dist_directory=dist_directory,
            solr_version=solr_version,
        )
    l.info("Upgraded shard indexes.")
예제 #12
0
def upgrade_lucene_shards_indexes(
    base_data_dir: str = MC_SOLR_BASE_DATA_DIR, dist_directory: str = MC_DIST_DIR, solr_version: str = MC_SOLR_VERSION
):
    """Run the Lucene index upgrade against every shard's data directory.

    The shard count is inferred from the consecutive shard data directories
    (1, 2, ...) found on disk. Raises Exception when fewer than two are found
    or when any shard's port is open.
    """
    data_root = resolve_absolute_path_under_mc_root(path=base_data_dir, must_exist=True)

    # Count shard directories upwards from 1 until the first missing one
    l.info("Looking for shards...")
    shard_count = 0
    while os.path.isdir(__shard_data_dir(shard_num=shard_count + 1, base_data_dir=data_root)):
        shard_count += 1
    if shard_count < 2:
        raise Exception("Found less than 2 shards.")
    l.info("Found %d shards." % shard_count)

    shard_numbers = range(1, shard_count + 1)

    # All ports are checked before any index is touched
    l.info("Making sure shards aren't running...")
    for shard_num in shard_numbers:
        port = __shard_port(shard_num=shard_num,
                            starting_port=MC_SOLR_CLUSTER_STARTING_PORT)
        if tcp_port_is_open(port=port):
            raise Exception("Solr shard %d is running on port %d." % (shard_num, port))
    l.info("Made sure shards aren't running.")

    l.info("Upgrading shard indexes...")
    for shard_num in shard_numbers:
        instance_dir = __shard_data_dir(shard_num=shard_num, base_data_dir=data_root)
        __upgrade_lucene_index(instance_data_dir=instance_dir,
                               dist_directory=dist_directory,
                               solr_version=solr_version)
    l.info("Upgraded shard indexes.")
예제 #13
0
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes a fresh zoo.cfg into the ZooKeeper data directory, starts
    "zkServer.sh start-foreground" as a child process, waits for the port to
    open, uploads the Solr collection configuration and then sleeps in an
    endless loop, so this function does not return normally.

    :param dist_directory: distribution directory in which ZooKeeper is looked up.
    :param listen: address written to zoo.cfg as clientPortAddress.
    :param port: port written to zoo.cfg as clientPort; must not be open yet.
    :param data_dir: base data directory (must exist); ZooKeeper's own data
        directory "mediacloud-cluster-zookeeper" is created under it if missing.
    :param zookeeper_version: ZooKeeper version used to build the distribution path.
    :param solr_version: Solr version passed to update_zookeeper_solr_configuration().
    :raises McZooKeeperRunException: if the port is already open, zkServer.sh or
        log4j.properties is missing, or ZooKeeper's port never opens.
    """
    # NOTE(review): both helpers are called without arguments, so a non-default
    # dist_directory / zookeeper_version is not forwarded to them -- confirm intended.
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir, must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException("Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory, zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." % zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf", "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message shows a literal '%s'
        raise McZooKeeperRunException("log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)

    # The configuration is rewritten on every start
    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
            """ % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir  # Serves as configuration dir too
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path)

    args = [
        zkserver_path,
        "start-foreground"
    ]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))

    process = subprocess.Popen(args, env=zookeeper_env)
    # The PID is kept in a module-level variable
    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    signal.signal(signal.SIGTERM, __kill_zookeeper_process)  # SIGTERM is handled differently for whatever reason
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException("Unable to connect to ZooKeeper at port %d" % port)

    log.info("Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    # Keep the parent process alive while the child runs
    while True:
        time.sleep(1)
예제 #14
0
def __solr_home_path(solr_home_dir: str = MC_SOLR_HOME_DIR) -> str:
    """Resolve the Solr home directory (the one holding collection subdirectories).

    The directory is required to exist.
    """
    return resolve_absolute_path_under_mc_root(path=solr_home_dir, must_exist=True)
예제 #15
0
def __raise_if_old_shards_exist() -> None:
    """Raise exception with migration instructions if old shard directories exist already.

    Old shards are directories named "mediacloud-shard-<N>" located directly
    under the path that resolve_absolute_path_under_mc_root(".") returns.
    When none exist the function returns silently; otherwise it always raises.

    :raises Exception: with shell commands for migrating the old shards, or
        with a short error when a shard directory name cannot be parsed, a
        shard has no collections, a source data directory is missing or a
        destination data directory already exists.
    """

    pwd = resolve_absolute_path_under_mc_root(path=".")
    old_shards = glob.glob(pwd + "/mediacloud-shard-*")

    if not old_shards:
        # No old shards to migrate
        return

    # The highest shard number found is taken as the shard count
    num_shards = 0
    for old_shard_path in old_shards:
        old_shard_dir = os.path.basename(old_shard_path)

        shard_num_match = re.search(r"^mediacloud-shard-(\d+?)$", old_shard_dir)
        if shard_num_match is None:
            raise Exception("Unable to parse shard number for old shard directory '%s'" % old_shard_dir)

        num_shards = max(num_shards, int(shard_num_match.group(1)))

    message_parts = ["Old shards were found at paths:\n\n"]
    message_parts.extend("* %s\n" % old_shard_path for old_shard_path in old_shards)

    message_parts.append("\n")
    message_parts.append("Please migrate them by running:\n")
    message_parts.append("\n")
    message_parts.append("cd %s\n" % pwd)
    message_parts.append("\n")
    message_parts.append("# Create empty new shard directory structure for each shard:\n")
    for shard_num in range(1, num_shards + 1):
        message_parts.append((
            "./run_solr_shard.py --shard_num %(shard_num)d --shard_count %(shard_count)d "
            + '|| echo "It\'s fine to fail at this point."\n'
        ) % {"shard_num": shard_num, "shard_count": num_shards})

    message_parts.append("\n")
    message_parts.append("# Move data from old shards to new ones\n")
    for shard_num in range(1, num_shards + 1):
        # Paths in the printed commands stay relative because the instructions
        # start with "cd <pwd>". The filesystem checks are anchored at pwd so
        # that they do not depend on the current working directory.
        shard_solr_path = "mediacloud-shard-%d/solr/" % shard_num
        shard_collection_paths = glob.glob(os.path.join(pwd, shard_solr_path + "/collection*"))
        if not shard_collection_paths:
            raise Exception("No collections found in shard '%d'" % shard_num)
        for collection_path in shard_collection_paths:
            collection_name = os.path.basename(collection_path)

            src_collection_data_path = os.path.join(shard_solr_path, collection_name, "data")
            if not os.path.isdir(os.path.join(pwd, src_collection_data_path)):
                raise Exception("Source data directory '%s' does not exist." % src_collection_data_path)

            dst_shard_data_dir = __shard_data_dir(shard_num=shard_num)
            dst_collection_data_path = os.path.join(dst_shard_data_dir, collection_name, "data")
            if os.path.isdir(dst_collection_data_path):
                raise Exception("Destination data directory '%s' already exists." % dst_collection_data_path)

            message_parts.append("mv %(src_collection_data_dir)s %(dst_collection_data_dir)s\n" % {
                "src_collection_data_dir": src_collection_data_path,
                "dst_collection_data_dir": dst_collection_data_path,
            })
        message_parts.append("\n")

    message_parts.append("# Remove old shards\n")
    for shard_num in range(1, num_shards + 1):
        message_parts.append("rm -rf mediacloud-shard-%d/\n" % shard_num)

    raise Exception("".join(message_parts))
예제 #16
0
파일: solr.py 프로젝트: oakieoak/mediacloud
def __raise_if_old_shards_exist() -> None:
    """Raise exception with migration instructions if old shard directories exist already.

    Old shards are directories named "mediacloud-shard-<N>" located directly
    under the path that resolve_absolute_path_under_mc_root(".") returns.
    When none exist the function returns silently; otherwise it always raises.

    :raises McSolrRunException: with shell commands for migrating the old
        shards, or with a short error when a shard directory name cannot be
        parsed, a shard has no collections, a source data directory is missing
        or a destination data directory already exists.
    """

    pwd = resolve_absolute_path_under_mc_root(path=".")
    old_shards = glob.glob(pwd + "/mediacloud-shard-*")

    if not old_shards:
        # No old shards to migrate
        return

    # The highest shard number found is taken as the shard count
    num_shards = 0
    for old_shard_path in old_shards:
        old_shard_dir = os.path.basename(old_shard_path)

        shard_num_match = re.search(r'^mediacloud-shard-(\d+?)$', old_shard_dir)
        if shard_num_match is None:
            raise McSolrRunException(
                "Unable to parse shard number for old shard directory '%s'" %
                old_shard_dir)

        num_shards = max(num_shards, int(shard_num_match.group(1)))

    message_parts = ["Old shards were found at paths:\n\n"]
    message_parts.extend("* %s\n" % old_shard_path
                         for old_shard_path in old_shards)

    message_parts.append("\n")
    message_parts.append("Please migrate them by running:\n")
    message_parts.append("\n")
    message_parts.append("cd %s\n" % pwd)
    message_parts.append("\n")
    message_parts.append(
        "# Create empty new shard directory structure for each shard:\n")
    for shard_num in range(1, num_shards + 1):
        message_parts.append((
            "./run_solr_shard.py --shard_num %(shard_num)d --shard_count %(shard_count)d "
            + "|| echo \"It's fine to fail at this point.\"\n") % {
                "shard_num": shard_num,
                "shard_count": num_shards,
            })

    message_parts.append("\n")
    message_parts.append("# Move data from old shards to new ones\n")
    for shard_num in range(1, num_shards + 1):
        # Paths in the printed commands stay relative because the instructions
        # start with "cd <pwd>". The filesystem checks are anchored at pwd so
        # that they do not depend on the current working directory.
        shard_solr_path = "mediacloud-shard-%d/solr/" % shard_num
        shard_collection_paths = glob.glob(
            os.path.join(pwd, shard_solr_path + "/collection*"))
        if not shard_collection_paths:
            raise McSolrRunException("No collections found in shard '%d'" %
                                     shard_num)
        for collection_path in shard_collection_paths:
            collection_name = os.path.basename(collection_path)

            src_collection_data_path = os.path.join(shard_solr_path,
                                                    collection_name, "data")
            if not os.path.isdir(os.path.join(pwd, src_collection_data_path)):
                raise McSolrRunException(
                    "Source data directory '%s' does not exist." %
                    src_collection_data_path)

            dst_shard_data_dir = __shard_data_dir(shard_num=shard_num)
            dst_collection_data_path = os.path.join(dst_shard_data_dir,
                                                    collection_name, "data")
            if os.path.isdir(dst_collection_data_path):
                raise McSolrRunException(
                    "Destination data directory '%s' already exists." %
                    dst_collection_data_path)

            message_parts.append(
                "mv %(src_collection_data_dir)s %(dst_collection_data_dir)s\n" % {
                    "src_collection_data_dir": src_collection_data_path,
                    "dst_collection_data_dir": dst_collection_data_path,
                })
        message_parts.append("\n")

    message_parts.append("# Remove old shards\n")
    for shard_num in range(1, num_shards + 1):
        message_parts.append("rm -rf mediacloud-shard-%d/\n" % shard_num)

    raise McSolrRunException("".join(message_parts))
예제 #17
0
파일: solr.py 프로젝트: oakieoak/mediacloud
def __solr_home_path(solr_home_dir: str = MC_SOLR_HOME_DIR) -> str:
    """Resolve the Solr home directory (the one holding collection subdirectories).

    The directory is required to exist.
    """
    return resolve_absolute_path_under_mc_root(path=solr_home_dir, must_exist=True)
예제 #18
0
def __solr_path(dist_directory: str = MC_DIST_DIR, solr_version: str = MC_SOLR_VERSION) -> str:
    """Build the path at which the Solr distribution is expected.

    The distribution directory must already exist; a versioned
    "solr-<version>" directory name is appended to its resolved path.
    """
    resolved_dist_dir = resolve_absolute_path_under_mc_root(path=dist_directory, must_exist=True)
    versioned_dir_name = "solr-%s" % solr_version
    return os.path.join(resolved_dist_dir, versioned_dir_name)
예제 #19
0
def run_zookeeper(dist_directory: str = MC_DIST_DIR,
                  listen: str = MC_ZOOKEEPER_LISTEN,
                  port: int = MC_ZOOKEEPER_PORT,
                  data_dir: str = MC_SOLR_BASE_DATA_DIR,
                  zookeeper_version: str = MC_ZOOKEEPER_VERSION,
                  solr_version: str = MC_SOLR_VERSION) -> None:
    """Run ZooKeeper, install if needed too.

    Writes a fresh zoo.cfg into the ZooKeeper data directory, starts
    "zkServer.sh start-foreground" as a child process, waits for the port to
    open, uploads the Solr collection configuration and then sleeps in an
    endless loop, so this function does not return normally.

    :param dist_directory: distribution directory in which ZooKeeper is looked up.
    :param listen: address written to zoo.cfg as clientPortAddress.
    :param port: port written to zoo.cfg as clientPort; must not be open yet.
    :param data_dir: base data directory (must exist); ZooKeeper's own data
        directory "mediacloud-cluster-zookeeper" is created under it if missing.
    :param zookeeper_version: ZooKeeper version used to build the distribution path.
    :param solr_version: Solr version passed to update_zookeeper_solr_configuration().
    :raises McZooKeeperRunException: if the port is already open, zkServer.sh or
        log4j.properties is missing, or ZooKeeper's port never opens.
    """
    # NOTE(review): both helpers are called without arguments, so a non-default
    # dist_directory / zookeeper_version is not forwarded to them -- confirm intended.
    if not __zookeeper_is_installed():
        log.info("ZooKeeper is not installed, installing...")
        __install_zookeeper()

    data_dir = resolve_absolute_path_under_mc_root(path=data_dir,
                                                   must_exist=True)

    zookeeper_data_dir = os.path.join(data_dir, "mediacloud-cluster-zookeeper")
    if not os.path.isdir(zookeeper_data_dir):
        log.info("Creating data directory at %s..." % zookeeper_data_dir)
        mkdir_p(zookeeper_data_dir)

    if tcp_port_is_open(port=port):
        raise McZooKeeperRunException(
            "Port %d is already open on this machine." % port)

    zookeeper_path = __zookeeper_path(dist_directory=dist_directory,
                                      zookeeper_version=zookeeper_version)

    zkserver_path = os.path.join(zookeeper_path, "bin", "zkServer.sh")
    if not os.path.isfile(zkserver_path):
        raise McZooKeeperRunException("zkServer.sh at '%s' was not found." %
                                      zkserver_path)

    log4j_properties_path = os.path.join(zookeeper_path, "conf",
                                         "log4j.properties")
    if not os.path.isfile(log4j_properties_path):
        # The path has to be interpolated, otherwise the message shows a literal '%s'
        raise McZooKeeperRunException(
            "log4j.properties at '%s' was not found." % log4j_properties_path)

    zoo_cnf_path = os.path.join(zookeeper_data_dir, "zoo.cfg")
    log.info("Creating zoo.cfg in '%s'..." % zoo_cnf_path)

    # The configuration is rewritten on every start
    with open(zoo_cnf_path, 'w') as zoo_cnf:
        zoo_cnf.write("""
#
# This file is autogenerated. Please do not modify it!
#

clientPortAddress=%(listen)s
clientPort=%(port)d
dataDir=%(data_dir)s

# Must be between zkClientTimeout / 2 and zkClientTimeout / 20
tickTime=30000

initLimit=10
syncLimit=10
            """ % {
            "listen": listen,
            "port": port,
            "data_dir": zookeeper_data_dir,
        })

    zookeeper_env = os.environ.copy()
    # Serves as configuration dir too
    zookeeper_env["ZOOCFGDIR"] = zookeeper_data_dir
    zookeeper_env["ZOOCFG"] = "zoo.cfg"
    zookeeper_env["ZOO_LOG_DIR"] = zookeeper_data_dir
    zookeeper_env["SERVER_JVMFLAGS"] = (
        "-Dlog4j.configuration=file://" + os.path.abspath(log4j_properties_path))

    args = [zkserver_path, "start-foreground"]

    log.info("Starting ZooKeeper on %s:%d..." % (listen, port))
    log.debug("Running command: %s" % str(args))
    log.debug("Environment variables: %s" % str(zookeeper_env))

    process = subprocess.Popen(args, env=zookeeper_env)
    # The PID is kept in a module-level variable
    global __zookeeper_pid
    __zookeeper_pid = process.pid

    # Declare that we don't care about the exit code of the child process so
    # it doesn't become a zombie when it gets killed in signal handler
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    # SIGTERM is handled differently for whatever reason
    signal.signal(signal.SIGTERM, __kill_zookeeper_process)
    atexit.register(__kill_zookeeper_process)

    log.info("ZooKeeper PID: %d" % __zookeeper_pid)

    log.info("Waiting for ZooKeeper to start at port %d..." % port)
    zookeeper_started = wait_for_tcp_port_to_open(
        port=port, retries=MC_ZOOKEEPER_CONNECT_RETRIES)
    if not zookeeper_started:
        raise McZooKeeperRunException(
            "Unable to connect to ZooKeeper at port %d" % port)

    log.info(
        "Uploading initial Solr collection configurations to ZooKeeper...")
    update_zookeeper_solr_configuration(zookeeper_host="localhost",
                                        zookeeper_port=port,
                                        dist_directory=dist_directory,
                                        solr_version=solr_version)

    log.info("ZooKeeper is ready on port %d!" % port)
    # Keep the parent process alive while the child runs
    while True:
        time.sleep(1)