示例#1
0
def get_jaccard_distribution(pf, args, start_t):
    """Sample random pairs and save their Jaccard similarity to CSV.

    Two independent random permutations of ``range(pf.n_docs)`` are zipped
    together to form the pairs; pairs made of the same index twice are
    skipped. The result is used by jaccard_distribution.R to fit a
    distribution to the data.

    Args:
        pf: Object exposing ``n_docs``, ``jaccard_similarity(u1, u2)`` and
            ``sig_sim(u1, u2)``.
        args: Object whose ``results`` attribute is passed to ``CsvWriter``
            as the output target.
        start_t: Unused here; kept so the signature matches its callers.

    Returns:
        Always ``True``.
    """
    csv = CsvWriter(args.results, append=True)
    csv.write_header(['u1', 'u2', 'jac_sim', 'sig_sim'])

    written = 0
    pairs = zip(np.random.permutation(pf.n_docs),
                np.random.permutation(pf.n_docs))
    for u1, u2 in pairs:
        # A pair of an item with itself carries no information. The original
        # used the bare name `next` here, which is a no-op expression in
        # Python; `continue` is what actually skips the pair.
        if u1 == u2:
            continue
        csv.write([u1, u2, pf.jaccard_similarity(u1, u2), pf.sig_sim(u1, u2)])
        written += 1
        # Progress is overwritten in place ('\r') and counts rows written.
        print("Wrote {} similarities".format(written), end='\r')

    return True
示例#2
0
def get_candidate_distribution(pf, args, start_t):
    """Sample pairs from candidates, stratified by signature similarity.

    The candidates from ``pf.candidates()`` are split into strata of width
    ``step`` over the similarity range [0, 1]; the last stratum also contains
    similarity exactly 1.0. From each stratum at most ``max_per_step``
    randomly chosen pairs are written to the CSV, including the signature
    similarity, Jaccard similarity, and weight (> 1 if there were more
    samples in the stratum than max_per_step, to get approximately accurate
    frequencies).

    Args:
        pf: Object exposing ``candidates()`` (yielding ``((c1, c2), sim)``
            items), ``jaccard_similarity(c1, c2)``, ``sig_len``, ``n_bands``,
            ``max_buckets`` and ``buckets``.
        args: Object whose ``results`` attribute is passed to ``CsvWriter``
            as the output target.
        start_t: Unused here; kept so the signature matches its callers.

    Returns:
        Always ``True``.
    """
    candidates = list(pf.candidates())

    csv = CsvWriter(args.results, append=True)
    csv.write_header([
        'run_id', 'u1', 'u2', 'sig_sim', 'jac_sim', 'sig_len', 'bands',
        'max_buckets', 'used_buckets', 'weight'
    ])

    run_id = datetime.now().isoformat()

    step = 0.05
    max_per_step = 100
    # Derive the bounds from an integer index rather than accumulating
    # `lim += step`: repeated float addition drifts, so both the bounds and
    # the number of loop iterations would depend on rounding error.
    n_steps = int(round(1 / step))

    for k in range(n_steps):
        is_last = k == n_steps - 1
        lower = k * step
        upper = 1.0 if is_last else (k + 1) * step
        # Strata are half-open [lower, upper); the last one is closed so
        # that pairs with similarity exactly 1.0 are always sampled.
        cand = [(pair, sim) for pair, sim in candidates
                if lower <= sim < upper or (is_last and sim == upper)]
        n = len(cand)
        print("{} candidates between {} and {}".format(n, lower, upper))

        # Each sampled row stands for n / max_per_step candidates when the
        # stratum is larger than the sample, and for itself otherwise.
        weight = max(n, max_per_step) / max_per_step
        for i in np.random.permutation(n)[:max_per_step]:
            (c1, c2), sim = cand[i]
            csv.write([
                run_id, c1, c2, sim,
                pf.jaccard_similarity(c1, c2), pf.sig_len, pf.n_bands,
                pf.max_buckets,
                len(pf.buckets), weight
            ])

    return True
示例#3
0
import time
import subprocess
import os
import gc

import data
from csv_writer import CsvWriter
from util import ensure_directory
"""
This file simulates the evaluation environment and stores results
as a CSV.
"""

# Module-level results writer, created when this module is loaded. It is
# opened with append=True on diagnostics/out/evaluation.csv and write_header
# is called immediately with the column names below.
# NOTE(review): presumably CsvWriter skips the header when appending to an
# existing file — confirm in csv_writer.
csv = CsvWriter("diagnostics/out/evaluation.csv", append=True)
csv.write_header([
    'note', 'batch', 'run', 'found', 'incorrect', 'time', 'ppm', 'terminated'
])


def jaccard_sim(data, u1, u2):
    """Return the Jaccard similarity of the movies of users u1 and u2.

    ``data.user`` and ``data.movie`` are indexed in parallel: the entries of
    ``data.movie`` at positions where ``data.user`` equals a user id are
    taken as that user's movies. The result is the size of the intersection
    of the two movie collections divided by the size of their union.

    Raises:
        ZeroDivisionError: If neither user has any movies (empty union).
    """
    movies_a = data.movie[data.user == u1]
    movies_b = data.movie[data.user == u2]

    shared = np.intersect1d(movies_a, movies_b)
    combined = np.union1d(movies_a, movies_b)

    return shared.size / combined.size


def run_evaluation(note, batch=0, runs=5):
    for run in range(runs):
        cmd = [
            "python3",
            "main.py",
            "--rows",
示例#4
0
import numpy as np
import multiprocessing as mp
from sklearn.model_selection import ParameterGrid
from datetime import datetime
import time
import subprocess
import os

from csv_writer import CsvWriter
from util import ensure_directory

# Module-level results writer, created when this module is loaded. It is
# opened with append=True on diagnostics/out/experiments.csv and write_header
# is called immediately with the column names below.
# NOTE(review): presumably CsvWriter skips the header when appending to an
# existing file — confirm in csv_writer.
csv = CsvWriter("diagnostics/out/experiments.csv", append=True)
csv.write_header(
    ['batch_id', 'run_id', 'bands', 'rows', 'max_buckets', 'time', 'count'])

# ISO-8601 timestamp taken once at import time.
# NOTE(review): presumably used as the 'batch_id' column for every row
# written by this process — confirm against the code that writes rows.
batch_id = datetime.now().isoformat()


def count_lines_in_file(filename):
    """Return the number of non-blank lines in ``filename``.

    A line counts as blank when it is empty after stripping whitespace.
    A path that does not exist yields 0 instead of raising.
    """
    if os.path.exists(filename):
        with open(filename) as handle:
            return sum(1 for line in handle if line.strip() != '')
    return 0


def perform(params):