Exemplo n.º 1
0
def deploy(target, url, force=False):
    """
    Download the instance archive from ``url`` and unpack it into ``target``.

    :param target: Directory the zip archive is extracted into.
    :param url: Location of the instance zip archive.
    :param force: Deploy even when the instance folder already exists.
    :raises click.BadParameter: If ``url`` is ``None``.
    """
    if url is None:
        raise click.BadParameter("Please set instance asset url in INSTANCE_URL environment variable")

    if instance_path().exists() and not force:
        logger.info("Skipping deployment; Instance folder exists")
        return

    logger.debug("Retrieving instance folder from %s", url)
    try:
        local_filename, headers = urllib.request.urlretrieve(url)
        # Report download details through logging instead of a bare print(),
        # so they only show up when debug output is enabled.
        logger.debug("Downloaded %s (headers: %s)", local_filename, headers)
        with ZipFile(local_filename) as instance_zip:
            instance_zip.extractall(target)
    finally:
        # urlretrieve() without an explicit filename stores the download in a
        # temporary file; remove it whether or not extraction succeeded.
        urllib.request.urlcleanup()
Exemplo n.º 2
0
def _instance_path(prefix=number_topics):
    """Return the ``lda/<prefix>`` directory under the instance path, creating it if needed."""
    lda_dir = instance_path().joinpath("lda", str(prefix))
    lda_dir.mkdir(parents=True, exist_ok=True)
    return lda_dir
Exemplo n.º 3
0
def build(target, method: list, dataset_name, limit: int, number_of_topics):
    """
    Build page.

    :param target: Target directory; receives ``pages``, ``nodes.json`` and ``links.json``.
    :param method: List of methods to use.
    :param dataset_name: Name of the module under ``agora_analytica.data``
        that provides ``load_dataset()``.
    :param limit: Limit processing into N candidates.
    :param number_of_topics: Number of topics for the text analysis; ``-1``
        takes the value from settings, falling back to ``sqrt(limit)``.
    :raises click.BadParameter: If ``limit`` is smaller than 2.
    """

    click.echo("Loading dataset ... ", nl=False)

    dataset = importlib.import_module(f".{dataset_name}", "agora_analytica.data")
    df = dataset.load_dataset()

    if limit < 2:
        # The message now matches the check: exactly two candidates are accepted.
        raise click.BadParameter("Build should include at least 2 candidates.", param_hint="--limit")
    df = df.sample(min(limit, df.shape[0]))
    click.echo("[DONE]")

    click.echo("Calculating distances ... ", nl=False)
    distances = measure_distances(df, methods=method)
    click.echo("[DONE]")

    click.echo("Analyzing text ... ", nl=False)

    if number_of_topics == -1:
        # Using squareroot seems to provide pretty good default
        number_of_topics = settings.getint("build", "number_of_topics", fallback=np.sqrt(limit))
    number_of_topics = int(number_of_topics)
    # Store the effective value so it is written back to app.cfg below.
    settings.set("build", "number_of_topics", str(number_of_topics))

    click.echo(f"Topics: {number_of_topics} ", nl=False)

    texts_df = df.text_answers().sort_index()
    visualization = settings.getboolean('build', 'generate_visualization', fallback=debug)

    topics = TextTopics(texts_df, number_topics=number_of_topics, generate_visualization=visualization)
    words = {}

    n = texts_df.shape[0]

    # Compare every unordered pair of rows once and store the term for both directions.
    for a in range(n):
        a_idx = texts_df.index[a]
        for b in range(a + 1, n):
            b_idx = texts_df.index[b]
            r = topics.compare_rows(texts_df, a_idx, b_idx)
            if r:
                words[(a_idx, b_idx)] = r[0][1]
                words[(b_idx, a_idx)] = r[1][1]

    click.echo("[DONE]")

    click.echo("Generating structures ... ", nl=False)
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    data_nodes = [{
        "id": int(idx),
        "name": row.get("name"),
        "party": row.get("party"),
        "image": row.get("image", None),
        "constituency": row.get("vaalipiiri"),
        "number": int(row.get("number", -1))
    } for idx, row in df.replace(np.nan, None).iterrows()]

    data_links = [{
        "source": int(source_id),
        "source_term": words.get((source_id, target_id), None),
        "distance": float(distance),
        "target_term": words.get((target_id, source_id), None),
        "target": int(target_id)
    } for source_id, distance, target_id in distances.values]
    click.echo("[DONE]")

    # Build static pages
    _build_pages(target / "pages")

    click.echo("Writing data ... ", nl=False)
    _write("nodes", data_nodes, target)
    _write("links", data_links, target)
    # Persist the settings (including the resolved number_of_topics) to the instance config.
    cfg = instance_path() / "app.cfg"
    with cfg.open('w') as f:
        settings.write(f, space_around_delimiters=True)
    click.echo("[DONE]")
Exemplo n.º 4
0
def _write(file, data, target=instance_path()):
    """ Serialize ``data`` with ``jsonify`` into ``<target>/<file>.json`` (indent 4 in debug mode, 0 otherwise). """

    json_path = os.path.join(target, f"{file}.json")
    with open(json_path, 'w') as handle:
        indent_width = 4 if debug else 0
        handle.write(jsonify(data, indent=indent_width))
Exemplo n.º 5
0
# Module-level debug flag; the `cli` group overwrites it with the value of
# --debug/--no-debug, and `_write` reads it to choose the JSON indentation.
debug = False

# Settings object returned by config(); `cli` loads the config file into it
# through settings.read().
settings = config()


def _write(file, data, target=instance_path()):
    """ Serialize ``data`` with ``jsonify`` into ``<target>/<file>.json`` (indent 4 in debug mode, 0 otherwise). """

    json_path = os.path.join(target, f"{file}.json")
    with open(json_path, 'w') as handle:
        indent_width = 4 if debug else 0
        handle.write(jsonify(data, indent=indent_width))


@click.group()
@click.option("--debug/--no-debug", default=debug, help="Show debug output")
@click.option("--config", default=instance_path() / "app.cfg", help="Config file")
def cli(debug, config):
    # Group callback: publish the debug flag, set up logging, load the config file.
    # The `debug` parameter shadows the module-level flag, so the module value
    # has to be updated through globals().
    globals().update(debug=debug)
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)
    settings.read(config)


@cli.command()
@click.option("--target",
              type=click.Path(file_okay=False),
              default=Path.cwd(),
              show_default=True)
@click.option("--url",
              default=os.environ.get("INSTANCE_URL", None),
              show_default=True)
def deploy(target, url, force=False):
    # --url defaults to the INSTANCE_URL environment variable; when neither is
    # provided, fail with a usage error that names the variable to set.
    if url is None:
        raise click.BadParameter("Please set instance asset url in INSTANCE_URL environment variable")
Exemplo n.º 6
0
import click
from agora_analytica import instance_path
from agora_analytica.data.utils import generate_names
from agora_analytica.data.interpolation.wikidata import finnish_politicians
import pandas as pd

# Extra attributes to append to the image url. By default Wikipedia uses
# 300px wide images, so that is good enough for us.
IMAGE_URL_APPEND = "?width=300px"


@click.command()
@click.argument('file',
                type=click.Path(file_okay=True, dir_okay=False, exists=True),
                default=instance_path() / "nodes.json")
def cli_obfuscate(file):
    """ Obfuscate contents of node FILE """

    # Opened read/write ("r+"); the records are loaded into a DataFrame first.
    with open(file, mode="r+") as fp:
        df = pd.read_json(fp, orient="records")

        # Look up an image for each row by its lowercased, stripped name.
        # NOTE(review): presumably politician_pictures() returns a mapping of
        # lowercased name -> image url; confirm against its definition.
        images = politician_pictures()
        for idx, row in df.iterrows():
            name = row['name'].lower().strip()
            img = images.get(name, None)
            # Store the url with the size suffix, or None when no image was found.
            df.loc[idx, "image"] = img + IMAGE_URL_APPEND if img else None

        # Replace the frame with the result of fake_names() (generated names).
        df = fake_names(df)
Exemplo n.º 7
0
def test_instancepath():
    """instance_path() must return a ``Path`` that points at an existing directory."""
    location = instance_path()
    assert isinstance(location, Path)
    assert location.is_dir()
Exemplo n.º 8
0
# Module-level debug flag; the `cli` group overwrites it with the value of
# --debug/--no-debug, and `_write` reads it to choose the JSON indentation.
debug = False

# Settings object returned by config(); `cli` loads the config file into it
# through settings.read().
settings = config()


def _write(file, data, target=instance_path()):
    """ Serialize ``data`` with ``jsonify`` into ``<target>/<file>.json`` (indent 4 in debug mode, 0 otherwise). """

    json_path = os.path.join(target, f"{file}.json")
    with open(json_path, 'w') as handle:
        indent_width = 4 if debug else 0
        handle.write(jsonify(data, indent=indent_width))


@click.group()
@click.option("--debug/--no-debug", default=debug, help="Show debug output")
@click.option("--config",
              default=instance_path() / "app.cfg",
              help="Config file")
def cli(debug, config):
    # Group callback: publish the debug flag, set up logging, load the config file.
    # The `debug` parameter shadows the module-level flag, so the module value
    # has to be updated through globals().
    globals().update(debug=debug)
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(level=log_level)
    settings.read(config)


@cli.command()
@click.option("--target",
              type=click.Path(file_okay=False),
              default=(instance_path() / "..").resolve(),
              show_default=True)
@click.option("--url",
              default=os.environ.get("INSTANCE_URL", None),
              show_default=True)
Exemplo n.º 9
0
def build(target, method: list, dataset_name, limit: int, number_of_topics):
    """
    Build page.

    :param target: Target directory; receives ``pages``, ``nodes.json`` and ``links.json``.
    :param method: List of methods to use.
    :param dataset_name: Name of the module under ``agora_analytica.data``
        that provides ``load_dataset()``.
    :param limit: Limit processing into N candidates.
    :param number_of_topics: Number of topics for the text analysis; ``-1``
        takes the value from settings, falling back to ``sqrt(limit)``.
    :raises click.BadParameter: If ``limit`` is smaller than 2.
    """
    # Imported locally so the block does not rely on a module-level pandas import.
    import pandas as pd

    click.echo("Loading dataset ... ", nl=False)

    dataset = importlib.import_module(f".{dataset_name}",
                                      "agora_analytica.data")
    df = dataset.load_dataset()

    if limit < 2:
        # The message now matches the check: exactly two candidates are accepted.
        raise click.BadParameter(
            "Build should include at least 2 candidates.",
            param_hint="--limit")

    preferred_list_file = settings.get("build",
                                       "preferred_candidates",
                                       fallback=None)

    if preferred_list_file:
        with open(preferred_list_file) as fp:
            # One preferred candidate per line; skip blank lines and lines
            # beginning with `#`. Materialized as a list so it can safely be
            # consumed after the file is closed.
            preferred_candidates = [
                name for name in map(str.strip, fp)
                if name and not name.startswith("#")
            ]
        # Slice preferred candidates
        preferred_filter = df["name"].isin(preferred_candidates)
        preferred = df[preferred_filter]

        # Fill to the required amount with sampled data
        fill_count = clamp(df.shape[0] - preferred.shape[0],
                           limit - preferred.shape[0],
                           0)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        df = pd.concat([preferred, df[~preferred_filter].sample(fill_count)])
        del preferred, preferred_filter

    # sample to a correct size
    df = df.sample(min(limit, df.shape[0]))
    click.echo("[DONE]")

    click.echo("Calculating distances ... ", nl=False)
    distances = measure_distances(df, methods=method)
    click.echo("[DONE]")

    click.echo("Analyzing text ... ", nl=False)

    if number_of_topics == -1:
        # Using squareroot seems to provide pretty good default
        number_of_topics = settings.getint("build",
                                           "number_of_topics",
                                           fallback=np.sqrt(limit))
    number_of_topics = int(number_of_topics)
    # Store the effective value so it is written back to app.cfg below.
    settings.set("build", "number_of_topics", str(number_of_topics))

    click.echo(f"Topics: {number_of_topics} ", nl=False)

    texts_df = df.text_answers().sort_index()
    visualization = settings.getboolean('build',
                                        'generate_visualization',
                                        fallback=debug)

    topics = TextTopics(texts_df,
                        number_topics=number_of_topics,
                        generate_visualization=visualization)
    words = {}

    n = texts_df.shape[0]

    talkinpoints = {}

    # Compare every unordered pair of rows once and store the term for both
    # directions; also collect one talking point per row.
    for a in range(n):
        a_idx = texts_df.index[a]
        for b in range(a + 1, n):
            b_idx = texts_df.index[b]
            r = topics.compare_rows(texts_df, a_idx, b_idx)
            if r:
                words[(a_idx, b_idx)] = r[0][1]
                words[(b_idx, a_idx)] = r[1][1]

        talkinpoints[a_idx] = topics.find_talkingpoint(texts_df.loc[a_idx])

    click.echo("[DONE]")

    click.echo("Generating structures ... ", nl=False)
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    data_nodes = [{
        "id": int(idx),
        "name": row.get("name"),
        "party": row.get("party"),
        "image": row.get("image", None),
        "constituency": row.get("constituency"),
        "number": int(row.get("number", -1)),
        "talkinpoint": talkinpoints.get(int(idx), None)
    } for idx, row in df.replace(np.nan, None).iterrows()]

    data_links = [{
        "source": int(source_id),
        "source_term": words.get((source_id, target_id), None),
        "distance": float(distance),
        "target_term": words.get((target_id, source_id), None),
        "target": int(target_id)
    } for source_id, distance, target_id in distances.values]
    click.echo("[DONE]")

    # Build static pages
    _build_pages(target / "pages")

    click.echo("Writing data ... ", nl=False)
    _write("nodes", data_nodes, target)
    _write("links", data_links, target)
    # Persist the settings (including the resolved number_of_topics) to the instance config.
    cfg = instance_path() / "app.cfg"
    with cfg.open('w') as f:
        settings.write(f, space_around_delimiters=True)
    click.echo("[DONE]")