Exemplo n.º 1
0
def init_csv():
    """Initialize the data directory so that the right things exist.

    Preprocesses the haiku, deduplicates them, and writes the derived
    columns (line counts, colors, syllable estimates) to a CSV file.
    """
    raw = read_from_file()
    preprocessed = (preprocess(item) for item in raw)

    print("Preprocessing...")
    # Drop duplicates while keeping first-seen order.
    seen = set()
    cleaned = []
    for poem in preprocessed:
        if poem not in seen:
            seen.add(poem)
            cleaned.append(poem)

    print("Counting lines...")
    line_counts = [poem.count("/") + 1 for poem in cleaned]
    print("Finding colors...")
    color_lists = [find_colors(pos_tag(poem)) for poem in cleaned]
    print("Counting syllables...")
    syllable_lists = [estimate_syllables(poem) for poem in cleaned]
    syllable_totals = [sum(counts) for counts in syllable_lists]

    frame = pd.DataFrame(
        {
            "haiku": cleaned,
            "colors": color_lists,
            "lines": line_counts,
            "syllables": syllable_lists,
            "total_syllables": syllable_totals,
        }
    )
    frame.to_csv(get_data_dir() / "haiku.csv")
Exemplo n.º 2
0
def load_model() -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load the GPT-2 language model and its tokenizer from the data directory."""
    # TODO: Set the model path more intelligently.
    path = str(get_data_dir() / "models" / "gpt2")
    model: PreTrainedModel = GPT2LMHeadModel.from_pretrained(path)
    tokenizer: PreTrainedTokenizer = GPT2Tokenizer.from_pretrained(path)
    return model, tokenizer
Exemplo n.º 3
0
def generate_haiku(
    prompt: str = Query(
        None,
        description=
        "The prompt to begin the generated haiku with. If not given, a random one will be chosen.",
        max_length=50,
    ),
    seed: int = Query(
        None,
        description=
        "Seed the RNG. If not given, a random seed will be generated, used, and returned in the JSON response for reproducibility.",
        gt=0,
        lt=2**32,
    ),
    number: int = Query(5,
                        description="The number of haiku to generate.",
                        gt=0,
                        le=20),
    temperature: float = Query(
        1.0,
        description=
        "The temperature to use when generating the haiku. Higher temperatures result in more randomness.",
        gt=0,
    ),
    k: int = Query(
        0,
        description=
        "The number of highest probability vocabulary tokens to keep for top-k filtering.",
        ge=0,
    ),
    p: float = Query(
        0.9,
        description=
        "The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling.",
        ge=0,
        le=1,
    ),
    max_tokens: int = Query(
        20,
        description="The max length of the sequence to be generated.",
        gt=0,
    ),
):
    """Generate a random haiku based on the given prompt."""
    # Fill in the optional pieces: a random prompt, and a reproducible seed
    # that gets echoed back to the caller in the response.
    if prompt is None:
        prompt = get_random_prompt()
    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    results = generate(prompt, seed, number, temperature, k, p, max_tokens)

    logger.debug("Saving generated DataFrame to data/generated.csv")
    csv_path = get_data_dir() / "generated.csv"
    with open(csv_path, "a", encoding="utf-8", errors="ignore") as csv_file:
        # Only emit the CSV header when appending to an empty file.
        is_empty = csv_file.tell() == 0
        results.to_csv(csv_file, mode="a", header=is_empty, index=False)

    # TODO: This is a fairly expensive request. Profile and see what it would take to optimize.
    # TODO: Return the index for each generated haiku so that they're retrievable from the /generated/{n} links
    return {"seed": seed, "prompt": prompt, "haiku": results["haiku"]}
Exemplo n.º 4
0
def __get_colors() -> pd.DataFrame:
    """Load the color -> HTML color mapping as a DataFrame.

    Note that this CSV file uses hex RGB color codes for many of the colors, but falls back to using
    HTML named colors for colors without an RGB value.

    The colors with RGB values came from https://xkcd.com/color/rgb/ while the colors with the named
    values came from
    https://medium.com/@eleanorstrib/python-nltk-and-the-digital-humanities-finding-patterns-in-gothic-literature-aca84639ceeb
    """
    csv_path = get_data_dir() / "colors.csv"
    # First CSV column holds the color names, so use it as the index.
    return pd.read_csv(csv_path, index_col=0)
Exemplo n.º 5
0
def read_from_file() -> list:
    """Get a list of unclean haiku from the text file.

    Each haiku is a single string, with lines separated by `/`, and an end-of-haiku symbol `#`.
    Haiku in the text file are separated from each other by blank lines.

    :return: A list of haiku strings, one per haiku in `haiku.txt`.
    """
    haikus = []
    with open(get_data_dir() / "haiku.txt", "r", encoding="utf-8") as datafile:
        haiku = ""
        for line in datafile:
            line = line.strip()
            if line:
                if haiku:
                    # Separate the /'s by spaces so that str.split() works more intuitively.
                    haiku += " / "
                haiku += line
            elif not line and haiku:
                haikus.append(haiku)
                haiku = ""
        # BUG FIX: if the file does not end with a blank line, the final haiku
        # is still sitting in the accumulator — flush it so it isn't dropped.
        if haiku:
            haikus.append(haiku)
    return haikus