def analyze(ctx, id):
    """
    Analyze permutes an ad's text to find which words contribute most to its political rating.

    One classifier/vectorizer pair is loaded per configuration returned by
    ``confs(ctx.obj["base"])``, keyed by the configuration's language. Every
    ad row whose ``id`` column equals ``id`` and whose ``lang`` has a loaded
    classifier is then reported on stdout: the cleaned text, its baseline
    probability (as computed by ``text_probability``), and the four deleted
    words whose removal changes that probability the most in absolute value.

    Ads in a language without a classifier are skipped silently. Nothing is
    returned.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                # NOTE(security): dill.load runs arbitrary code from the file;
                # only load classifier files produced by a trusted build.
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    # Bind the id as a named parameter instead of formatting it into the SQL,
    # so quotes in the value can neither break nor alter the statement.
    records = DB.query("select * from ads where id = :id", id=id)

    for record in records:
        record_lang = record["lang"]
        if record_lang not in classifiers:
            continue

        classifier = classifiers[record_lang]
        text = clean_text(get_text(record), record["advertiser"])
        baseline = text_probability(classifier, text)

        # permute_text yields (deleted_word, permuted_text) pairs; the diff is
        # how far the probability moves once that word is removed.
        diffs = [(deleted_word,
                  baseline - text_probability(classifier, permuted_text))
                 for (deleted_word, permuted_text) in permute_text(text)]

        print("text: {}".format(text))
        print("original probability: {}".format(baseline))
        # Largest absolute change first; keep the top four.
        biggest_diffs = sorted(
            diffs, key=lambda word_diff: -abs(word_diff[1]))[:4]
        print("top difference-makers:")
        for (deleted_word, diff) in biggest_diffs:
            print(" - {}, {}".format(deleted_word, diff))
Exemplo n.º 2
0
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.

    One classifier/vectorizer pair is loaded per configuration returned by
    ``confs(ctx.obj["base"])``, keyed by the configuration's language.

    ``newest``: when truthy, only ads whose ``political_probability`` is 0
    are selected, restricted to ``lang`` if given and otherwise to the
    languages that have a loaded classifier. When falsy, every ad is
    selected (optionally restricted to ``lang``).

    ``lang``: optional language code used to filter the selected ads.

    For each selected ad whose language has a classifier (``en-IE`` ads are
    scored with the ``en-US`` classifier), the new probability is written to
    ``political_probability`` in batches of 100. An unsuppressed ad whose
    stored probability is at least 0.70 is never overwritten with a value
    below 0.70; the refusal is printed and the update is skipped.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                # NOTE(security): dill.load runs arbitrary code from the file;
                # only load classifier files produced by a trusted build.
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    # Language filters are bound as named parameters rather than formatted
    # into the SQL text, so their content can never alter the statement.
    params = {}
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = :lang"
            params["lang"] = lang
        elif classifiers:
            # One placeholder per language that has a classifier.
            # NOTE(review): en-IE ads are only selected here when an "en-IE"
            # classifier exists, although the loop below would score them
            # with the en-US one -- confirm whether that is intended.
            placeholders = []
            for position, language in enumerate(classifiers):
                name = "lang{}".format(position)
                params[name] = language
                placeholders.append(":" + name)
            query = query + " and lang in ({})".format(",".join(placeholders))
        else:
            # No classifiers means an empty language list; "lang in ()" is
            # not valid SQL and could match nothing anyway.
            print("found 0 ads")
            return
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = :lang"
            params["lang"] = lang

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query), **params)[0]["length"]
    records = DB.query(query, **params)
    print("found {} ads".format(length))

    update_sql = "update ads set political_probability=:probability where id=:id"
    out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[probability]}"
    threshold = 0.70
    updates = []
    for idx, record in enumerate(records, 1):
        record_lang = "en-US" if record["lang"] == "en-IE" else record["lang"]
        if record_lang not in classifiers:
            continue

        classifier = classifiers[record_lang]
        text = classifier["vectorizer"].transform([get_text(record)])
        probability = classifier["classifier"].predict_proba(text)[0][1]

        # Previously this case printed the refusal but still queued the
        # update; skip the update so the message is true.
        if (record["political_probability"] >= threshold
                and probability < threshold
                and not record["suppressed"]):
            print("refusing to downgrade probability of ad {}".format(
                record["id"]))
            continue

        update = {"id": record["id"], "probability": probability}
        updates.append(update)
        print(out.format(pid=update, info={"length": length, "idx": idx}))

        # Flush in batches of 100 to bound memory and statement size.
        if len(updates) >= 100:
            DB.bulk_query(update_sql, updates)
            updates = []

    # Write whatever is left over from the final partial batch.
    if updates:
        DB.bulk_query(update_sql, updates)
Exemplo n.º 3
0
def diagnostics(ctx):
    """
    Warning! Slow! Run all classifiers against our database

    For every configuration returned by ``confs(ctx.obj["base"])`` and every
    named classifier returned by ``get_classifiers()``, a report header is
    printed and ``train_classifier`` is called with a vectorizer built for
    that configuration. The value returned by ``train_classifier`` is
    discarded.
    """
    base = ctx.obj["base"]
    for directory, conf in confs(base):
        candidates = get_classifiers()
        for label, estimator in candidates.items():
            language = conf["language"]
            print("Report for {} in {}".format(label, language))
            # A vectorizer is built per classifier, after the header is printed.
            vectorizer = get_vectorizer(conf)
            train_classifier(estimator, vectorizer, directory, language)
Exemplo n.º 4
0
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.

    ``newest``: when truthy, only ads whose ``political_probability`` is 0
    are selected; otherwise every ad is selected.

    ``lang``: optional language code; when given, only ads in that language
    are selected.

    One classifier/vectorizer pair is loaded per configuration returned by
    ``confs(ctx.obj["base"])``. Each selected ad whose ``lang`` has a
    classifier gets a new probability, which is written back to
    ``political_probability`` in batches of 100. Ads in other languages are
    skipped.
    """
    # The language filter is bound as a named parameter rather than formatted
    # into the SQL text, so its content can never alter the statement.
    params = {}
    if newest:
        print("Running newest")
        select_sql = "select * from ads where political_probability = 0"
        if lang:
            select_sql = select_sql + " and lang = :lang"
            params["lang"] = lang
    else:
        print("Running every")
        select_sql = "select * from ads"
        if lang:
            select_sql = select_sql + " where lang = :lang"
            params["lang"] = lang

    length = DB.query("select count(*) as length from ({}) as t1;".format(
        select_sql), **params)[0]["length"]
    records = DB.query(select_sql, **params)

    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                # NOTE(security): dill.load runs arbitrary code from the file;
                # only load classifier files produced by a trusted build.
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    print("found {} ads".format(length))
    update_sql = "update ads set political_probability=:probability where id=:id"
    updates = []
    for idx, record in enumerate(records, 1):
        if record["lang"] not in classifiers:
            continue

        classifier = classifiers[record["lang"]]
        text = classifier["vectorizer"].transform(
            [get_text(record["html"])])
        update = {
            "id": record["id"],
            # Second column of predict_proba's first (only) row.
            "probability":
            classifier["classifier"].predict_proba(text)[0][1]
        }
        updates.append(update)

        print(
            "Classified {p[id]} ({l[idx]} of {l[length]}) with {p[probability]}"
            .format(p=update, l={
                "length": length,
                "idx": idx
            }))

        # Flush in batches of 100 to bound memory and statement size.
        if len(updates) >= 100:
            DB.bulk_query(update_sql, updates)
            updates = []

    # Write whatever is left over from the final partial batch.
    if updates:
        DB.bulk_query(update_sql, updates)
Exemplo n.º 5
0
def build(ctx):
    """
    Build classifiers for each of our languages.

    For every configuration returned by ``confs(ctx.obj["base"])``, a model
    is obtained from ``train_classifier``, serialized with ``dill`` to the
    path given by ``classifier_path(directory)``, and the saved path is
    printed.
    """
    base = ctx.obj["base"]
    for directory, conf in confs(base):
        estimator = get_classifier()
        vectorizer = get_vectorizer(conf)
        trained = train_classifier(estimator, vectorizer, directory,
                                   conf["language"])

        destination = classifier_path(directory)
        # Binary mode: dill writes a byte stream.
        with open(destination, 'wb') as sink:
            dill.dump(trained, sink)
        print("Saved model {}".format(destination))