Example #1
def scan(phrase, norm_params, n_override=None):
    phrases = None
    if type(phrase) is list or type(phrase) is tuple:
        phrases = phrase
    else:
        phrases = [phrase]
    if n_override is not None:
        n = n_override
    else:
        n = len(phrases[0].split(' '))
    counts = {phrase:{ys:0 for ys in util.iter_yearseason()} for phrase in phrases}
    for ys in util.iter_yearseason():
        unmet_phrases = set(phrases)
        count_adjust = norm_params[ys]
        for row in util.load_grams(n, ys):
            rphrase = row[1]
            for phrase in unmet_phrases:
                if rphrase == phrase:
                    unmet_phrases.remove(phrase)
                    count = row[0]
                    counts[phrase][ys] = int(count) / count_adjust
                    break #Only one phrase can match
            if len(unmet_phrases) == 0:
                break #No more phrases to be found this interval
    return counts
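A minimal usage sketch for scan(); the phrases, the flat normalization value of 1.0 per interval, and the historical.util import path (taken from elsewhere in these examples) are illustrative assumptions, not part of the original code.

# Hypothetical usage of scan() above. Assumes historical.util exposes
# iter_yearseason() and load_grams() as in the rest of these examples.
import historical.util as util

norm_params = {ys: 1.0 for ys in util.iter_yearseason()}  # no normalization
counts = scan(["personal information", "third parties"], norm_params)
for p, by_interval in counts.items():
    print(p, [by_interval[ys] for ys in util.iter_yearseason()])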
Example #2
def draw_html_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)
    ar = list(reversed(ar))
    l = list(reversed(l))

    fn, url = util.get_web_fn("graphs", "html_heatmaps", slugify(phrase))

    yss = [ys for ys in util.iter_yearseason()]

    if len(ar) == 0:
        sys.stderr.write("%s,%s has no values\n" % (n, phrase))
        return url

    with open(fn, "w+") as f:
        f.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
tr:nth-child(even) {background-color: #d2d2d2;}
</style>
</head>
<body>
""")

        f.write('<h3>Phrase: %s</h3>' % (phrase, ))
        f.write(
            '<h4><a href="%s" target="_blank">Graphical Heatmap</a></h4>\n' %
            (get_heatmap_url(n, phrase), ))
        f.write('<table>\n')
        headers = ["Domain"] + list(util.iter_yearseason())
        f.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
        for domId in range(len(ar)):
            f.write('<tr>\n')
            f.write('<td>%s</td>\n' % l[domId])
            for ysId in range(len(ar[0])):
                if ar[domId][ysId] == 1:
                    policy_url = "https://privacypolicies.cs.princeton.edu/fetch_policy.php?domain=%s&interval=%s_%s" % (
                        l[domId], yss[ysId][:4], yss[ysId][4])  #Policy URL
                    f.write('<td><a href="%s">%s</a></td>\n' %
                            (policy_url, yss[ysId]))
                else:
                    f.write('<td></td>\n')

        f.write('</table>\n')
        f.write("</body></html>")

    np.set_printoptions(**opts)
    return url
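A sketch of the inputs draw_html_heatmap_from_data() expects, assuming the surrounding module's sort_row_data, slugify, get_heatmap_url, and util helpers are importable; the domain names and 0/1 flags are invented, and each row of ar should carry one entry per interval from util.iter_yearseason() (truncated here for brevity).

# Hypothetical call: ar holds one row of 0/1 presence flags per domain and
# l holds the matching domain labels in the same order. Values are illustrative.
ar = [
    [0, 1, 1, 1],  # example.com carries the phrase from the 2nd interval on
    [0, 0, 1, 0],  # example.org carries it in one interval only
]
l = ["example.com", "example.org"]
url = draw_html_heatmap_from_data(3, "third party advertising", ar, l)
print(url)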
Example #3
def scan_for_phrase_doms(phrase, n):
    phrases = None
    if type(phrase) is list or type(phrase) is tuple:
        phrases = phrase
    else:
        phrases = [phrase]
    dom_hist = {
        phrase: {ys: []
                 for ys in util.iter_yearseason()}
        for phrase in phrases
    }
    for ys in util.iter_yearseason():
        unmet_phrases = set(phrases)
        for row in util.load_grams(n, ys):
            rphrase = row[1]
            if rphrase in unmet_phrases:
                unmet_phrases.remove(rphrase)
                doms = row[2:]
                dom_hist[rphrase][ys] = doms
            if len(unmet_phrases) == 0:
                break  #No more phrases to be found this interval
    return dom_hist
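The returned history maps phrase -> interval -> list of domains; a short consumption sketch, with an illustrative phrase and the same assumed historical.util import as above.

# Hypothetical consumer: count how many distinct domains ever used each phrase.
import historical.util as util

dom_hist = scan_for_phrase_doms(["do not track"], 3)
for p, by_interval in dom_hist.items():
    all_doms = set()
    for ys in util.iter_yearseason():
        all_doms.update(by_interval[ys])
    print("%s: %d distinct domains" % (p, len(all_doms)))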
Example #4
def load_grams_parallel(n,
                        limit=None,
                        search_for=None,
                        recount=False,
                        domain_norm_factor=None,
                        policy_norm_factor=None):
    util.get_blacklist()  #Ensure this is loaded first

    def queue_grams(q, n, yearseas, limit, search_for):
        for row in load_grams(n, yearseas, limit=limit):
            if search_for is not None:
                s = row[1]
                if s not in search_for:
                    continue

            if recount:
                counts = {}
                doms = row[2:]
                for count_name, (
                        countf,
                        count_friendly_name) in util.count_fxns.items():
                    counts[count_name] = countf(
                        doms,
                        yearseason=yearseas,
                        domain_norm_factor=domain_norm_factor,
                        policy_norm_factor=policy_norm_factor)
                row = (counts, *row[1:])
            q.put((yearseas, row))

    q = mp.Queue()
    procs = []
    queuefxn = queue_grams
    for yearseas in util.iter_yearseason():
        p = mp.Process(target=queuefxn,
                       args=(q, n, yearseas, limit, search_for))
        p.start()
        procs.append(p)

    while any((p.is_alive() for p in procs)):
        try:
            yield q.get(True, 10)
        except queue.Empty:
            pass
            #if any((p.is_alive() for p in procs)):
            #sys.stderr.write("Unexpected empty queue. Trying again.\n")

    q.close()
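A consumption sketch for the generator above; rows stream back interleaved across intervals, so the example groups them by yearseason. The gram size and search phrase are placeholders, and multiprocessing/queue imports are assumed as in the function itself.

# Hypothetical consumer of load_grams_parallel(): collect matching rows per interval.
from collections import defaultdict

rows_by_interval = defaultdict(list)
for yearseas, row in load_grams_parallel(2, search_for=frozenset({"opt out"})):
    rows_by_interval[yearseas].append(row)

for yearseas in sorted(rows_by_interval):
    print(yearseas, len(rows_by_interval[yearseas]), "matching rows")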
Example #5
def draw_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)

    fn, url = util.get_web_fn("graphs", "heatmaps", slugify(phrase))

    yss = [ys for ys in util.iter_yearseason()]

    fig = go.Figure(data=go.Heatmap(z=ar,
                                    y=l,
                                    x=yss,
                                    colorscale=[(0, "#d0d3d4"), (1,
                                                                 "#1a5276")],
                                    showscale=False))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(**opts)
    return url
Example #6
def find_originators_for_phrase(phrase, dom_hist_phrase, thresh):
    originators = {}
    dom_set = functools.reduce(lambda x, y: x | y,
                               (set(d) for d in dom_hist_phrase.values()))
    thresh_ct = int(math.ceil(thresh * len(dom_set)))
    if thresh_ct == 0:
        thresh_ct = 1
    #print("%s: %d" % (phrase, thresh_ct), end='')
    doms_so_far = set()
    for ys in util.iter_yearseason():
        doms_so_far |= set(dom_hist_phrase[ys])
        if len(doms_so_far) >= thresh_ct:
            #print(" %s" % len(doms_so_far))
            break
    for d in doms_so_far:
        if d not in originators:
            originators[d] = []
        originators[d].append((phrase, doms_so_far, dom_set, ys))
    return originators
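The threshold arithmetic above is easiest to see on a toy history; the intervals and domains below are invented and the function itself is not called.

# Toy illustration of the originator threshold: with thresh = 0.5 and four
# domains ever using the phrase, thresh_ct = ceil(0.5 * 4) = 2, so the scan
# would stop at the first interval where two distinct domains have used it.
import math

dom_hist_phrase = {
    "2018A": ["a.com"],
    "2018B": ["a.com", "b.com"],
    "2019A": ["a.com", "b.com", "c.com", "d.com"],
}
dom_set = set().union(*(set(d) for d in dom_hist_phrase.values()))
thresh_ct = max(1, int(math.ceil(0.5 * len(dom_set))))
print(thresh_ct)  # 2 -> first reached in interval 2018B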
Example #7
def load_top_grams(n, domain_norm_factor, policy_norm_factor, domain_counts):    
    #Step 1 -- filter top phrases
    top_phrases = set()

    tp_cutoffs = {ys: dc * 0.005 for ys, dc in domain_counts.items()}
    limit = min(tp_cutoffs.values())
    for yearseason, t in util.load_grams_parallel(n, limit=limit):
        if t[0] >= tp_cutoffs[yearseason]:
            top_phrases.add(t[1])
    
    print("\033[K%s-grams: Identified %d top phrases" % (n,len(top_phrases)))

    #Sanity check
    if len(top_phrases) == 0:
        sys.stderr.write("Found no top phrases for grams: %s\n" % str(n))
        return None

    #Step 2 -- find counts for top phrases
    gram_freq_by_countfxn = {count_name: {s: [0] * num_intervals for s in top_phrases}
                             for count_name in util.count_fxns}
    gram_freq_by_countfxn["raw"] = {s: [[] for _ in range(num_intervals)] for s in top_phrases}
    ys_idx_map = {ys: idx for idx, ys in enumerate(list(util.iter_yearseason()))}
    ct = 0
    for yearseason, (counts, s, *doms) in util.load_grams_parallel(
            n, search_for=frozenset(top_phrases), recount=True,
            domain_norm_factor=domain_norm_factor, policy_norm_factor=policy_norm_factor):
        #s=t[1]
        
        if ct % 100 == 0:
            print("\033[KLoaded %d phrase-intervals" % ct,end='\r')
        ct += 1
        
        #doms = t[2:]
        ys_idx = ys_idx_map[yearseason]

        gram_freq_by_countfxn["raw"][s][ys_idx] = doms

        for count_name, count in counts.items():
            gram_freq_by_countfxn[count_name][s][ys_idx] = count
        #Iterate counting methods i.e. unique, total, or alexa-weighted domains
        #for count_name, (countf, count_friendly_name) in util.count_fxns.items(): #FIXME
        #    gram_freq_by_countfxn[count_name][s][ys_idx] = countf(doms,yearseason=yearseason,domain_norm_factor=domain_norm_factor,policy_norm_factor=policy_norm_factor)
            
    
    print("\033[KData loaded",end="\r")
    return gram_freq_by_countfxn
Example #8
def draw_lines(filename, topX, freq_data=None):
    """
    Frequency data is an array of tuples. The first item is the phrase,
    the second item is the number of occurrences at each interval
    """

    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)

    fn, url = util.get_web_fn('graphs', "lines",
                              '%s.html' % os.path.basename(filename))
    if os.path.exists(fn):
        #        print("Skipping line drawing")
        return url

    yss = [ys for ys in util.iter_yearseason()]

    if freq_data is None:
        freq_data = get_frequency_data(filename, maxCount=topX)

    freq_data = list(freq_data)

    fig = go.Figure()

    #    layout=go.Layout(
    #            legend=dict(x=-.1, y=-0.5*(len(freq_data) // 10))
    #        )

    for label, data in freq_data:
        #wrappedlabel = textwrap.fill(label, 40)
        if len(label) >= 40:
            wrappedlabel = label[:37] + "..."
        else:
            wrappedlabel = label
        fig.add_trace(
            go.Scatter(x=yss, y=data, mode='lines', name=wrappedlabel))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(**opts)
    return url
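A sketch of the freq_data shape described in the docstring; the phrases, per-interval values, and output name are placeholders, and each value list should have one entry per interval from util.iter_yearseason() (truncated here for brevity).

# Hypothetical call: freq_data is an iterable of (phrase, per-interval counts).
freq_data = [
    ("personal information", [0.10, 0.12, 0.15, 0.18]),
    ("third parties", [0.08, 0.09, 0.09, 0.11]),
]
url = draw_lines("example_metric.csv", -1, freq_data=freq_data)
print(url)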
Example #9
                print("\033[K", end="\r")
                logging.info("\tDone dumping data")
                mem_count()

            webf.write('</tr>')
            util.close_pool()

        webf.write("""
</table>
</div>
</body>
        </html>""")
        logging.info("Done")


num_intervals = len(list(util.iter_yearseason()))


def main():
    global start, stop, topN, SENTENCES, WORDS, ENTITIES, ys_list

    if MEM_DEBUG:
        tracemalloc.start(10)

    parser = argparse.ArgumentParser(
        description='Runs basic analytics over pre-sorted n-grams')
    parser.add_argument(dest="N",
                        type=int,
                        help='Number of output phrases per metric')
    parser.add_argument(
        dest="grams",
Example #10
def create_data_dump(n,
                     folders,
                     basename,
                     groups,
                     gram_freq,
                     gram_freq_raw,
                     score_name_str,
                     scores=None,
                     gen_figs=False):
    """
    Take the given phrases and create:
    - A table of phrase, score (if available), occurrences
    - Line graphs for phrase occurrence
    - Heatmaps for phrases
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    group_to_graph = {}
    hit_to_graph = {}

    #Part one -- create CSV data files, heatmaps, and line graphs
    for group_num in range(len(groups)):
        group_hits = groups[group_num]
        nfn = os.path.join(util.OUT_DIR, "%s_%d.csv" % (basename, group_num))
        if gen_figs:
            #Line graph
            line_url = draw_lines.draw_lines(nfn, -1, ((hit, gram_freq[hit])
                                                       for hit in group_hits))
            logging.info("\t\tDone drawling lines for group %d" % (group_num))
            #Heatmp
            if n == 'w':
                html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                    n, group_hits)
            else:
                heatmap_urls, html_heatmap_urls = draw_heatmap.draw_html_and_reg_heatmaps(
                    n, group_hits, gram_freq_raw)
            logging.info("\033[K\t\tDone drawing heatmaps for group %d" %
                         (group_num))
        else:  #Skip drawing
            line_url = draw_lines.get_lines_url(nfn, -1)
            heatmap_urls = draw_heatmap.get_multiple_heatmap_urls(
                n, group_hits)
            html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                n, group_hits)
        group_to_graph[group_num] = line_url
        with open(nfn, "w+") as f:  #Write CSV data file for later use
            writer = csv.writer(f)
            for hit in group_hits:
                vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                writer.writerow([
                    hit.replace("\\", "\\\\").replace("\n", "\\n"), *vals_str
                ])
                #heatmap_url = heatmap_urls[hit]
                heatmap_url = html_heatmap_urls[hit]
                #                print(heatmap_url)
                hit_to_graph[hit] = heatmap_url
        logging.info("\t\tDone drawing group %d" % (group_num))

    #Part two -- create usable web page to explore
    with open(nwebfn, "w+") as nwebf:
        nwebf.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
</style>
</head>
<body>
""")
        pool = util.get_pool()
        all_hits = list(
            itertools.chain(*[[hit for hit in group] for group in groups]))

        plots = pool.starmap(miniplot.get_plot_as_img,
                             [(ys_list, gram_freq[hit]) for hit in all_hits])
        plots = list(plots)
        logging.info("\t\tDone drawing plots")

        #Break up by group
        pNum = 0
        for group, gid in zip(groups, range(len(groups))):
            if len(group) == 0: continue
            line_graph_url = group_to_graph[gid]
            nwebf.write('<h3>%s</h3>' % (score_name_str))
            nwebf.write(
                '<h4><a href="%s" target="_blank">Line graph for group %d</a></h4>\n'
                % (line_graph_url, gid))
            nwebf.write('<table>\n')
            headers = ["Score", "Phrase", "Bar Plot"] + list(
                util.iter_yearseason())
            if scores is None:
                headers = headers[1:]
            nwebf.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
            #Then phrase
            for hit in group:
                nwebf.write('<tr>\n')
                if scores is not None:
                    if abs(scores[hit]) < 0.01 and scores[hit] != 0:
                        nwebf.write('<td>%0.2E</td>' % scores[hit])
                    else:
                        nwebf.write('<td>%0.2f</td>' % scores[hit])
                nwebf.write('<td width="30%">\n')
                #vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                vals_str = [
                    ("%0.2E" if
                     (abs(gram_freq[hit][ysid]) != 0
                      and abs(gram_freq[hit][ysid]) < 0.01) else "%0.2f") %
                    (gram_freq[hit][ysid]) for ysid in range(num_intervals)
                ]

                heatmap_graph_url = hit_to_graph[hit]
                hit_link = '<a href="%s" target="_blank">%s</a>' % (
                    heatmap_graph_url, hit.replace("\\", "\\\\").replace(
                        "\n", "\\n"))

                plot = plots[
                    pNum]  # miniplot.get_plot_as_img(ys_list,gram_freq[hit])
                pNum += 1

                nwebf.write('</td><td>'.join([hit_link, plot,
                                              *vals_str]))  #TODO add miniplot
                nwebf.write('</td></tr>\n')
            logging.info("\t\tDone with table for group %d" % (gid))

            nwebf.write('</table>\n')
        nwebf.write("</body></html>")
    mem_count()
    return nwebfn, nurl
Example #11
def get_num_policy_families():
    for ys in util.iter_yearseason():
        util.__init_domains_cache(ys)
        yield len(util.domainsets_cache[ys])
Example #12
def get_largest_families():
    for ys in util.iter_yearseason():
        util.__init_domains_cache(ys)
        yield max((len(s) for s in util.domainsets_cache[ys].values()))
Example #13
def get_plot_as_img(x_vals, y_vals):
    # x_vals are the interval labels (e.g. "2019A") and y_vals the matching
    # metric values; the two parallel sequences are drawn as a small line chart
    buf = BytesIO()
    plt.figure(figsize=(4, 1.5))
    x_labels = x_vals
    x_vals = np.arange(float(len(x_labels)))
    fig = sns.lineplot(x=x_vals, y=y_vals, color="#666666")
    plt.xticks(x_vals)
    fig.set_xlim((0, len(x_vals) - 1))
    #fig.set_yticklabels(fig.get_yticklabels(), fontsize='x-small')
    fig.set_xticklabels(x_labels,
                        rotation=90,
                        horizontalalignment='center',
                        fontsize='x-small')
    plt.savefig(buf, format="png", bbox_inches='tight')
    buf.seek(0)
    plt.close()
    data = base64.b64encode(buf.getbuffer()).decode("ascii")
    return f"<img src='data:image/png;base64,{data}'/>"


if __name__ == "__main__":
    import historical.util as util
    data = [
        0.00, 0.00, 0.00, 0.00, 0.03, 0.02, 0.06, 0.07, 0.07, 0.10, 0.13, 0.12,
        0.10, 0.15, 0.12, 0.13, 0.06, 0.14, 0.10, 0.10, 0.07, 0.14
    ]
    ys = list(util.iter_yearseason())
    print(get_plot_as_img(ys, data))
Example #14
def find_originators(n,
                     metric,
                     thresh,
                     score_by=1,
                     randomize=False,
                     prune=False):
    global dom_hist_cache
    if PARALLEL:
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() // 4)
    if metric == "top":
        top_phrases = set()
        for yearseason in util.iter_yearseason():
            m = -1
            for t in util.load_grams(n, yearseason):
                num = int(t[0])
                if num < 10:
                    break
                top_phrases.add(t[1])
        phrases = list(top_phrases)
        del top_phrases
    else:
        phrases = list(zip(*util.load_top_phrases(n, t=metric)))[0]
    if randomize and dom_hist_cache is not None:  #We don't need to refetch the history for multiple runs
        dom_hist = dom_hist_cache
    else:
        if False:
            scan_for_phrase_doms_n_fixed = functools.partial(
                scan_for_phrase_doms, n=n)
            BATCH_SIZE = 10
            batches = (phrases[i:i + BATCH_SIZE]
                       for i in range(0, len(phrases), BATCH_SIZE))
            dom_hist = {}
            dh_batches = pool.map(scan_for_phrase_doms_n_fixed, batches)
            for dh in dh_batches:
                dom_hist.update(dh)
        else:
            dom_hist = scan_for_phrase_doms(phrases, n)
    if randomize:
        dom_hist_cache = dom_hist  #Don't save history unless we need to
        dom_hist = randomize_dom_hist(dom_hist)
    originators = {}
    for phrase in dom_hist:
        if PARALLEL:
            break
        originators_changes = find_originators_for_phrase(
            phrase, dom_hist[phrase], thresh)
        for d in originators_changes:
            if d not in originators:
                originators[d] = originators_changes[d]
            else:
                originators[d] += originators_changes[d]
    if PARALLEL:
        for originators_changes in pool.map(_find_originators_for_phrase,
                                            ((phrase, dom_hist[phrase], thresh)
                                             for phrase in dom_hist)):
            for d in originators_changes:
                if d not in originators:
                    originators[d] = originators_changes[d]
                else:
                    originators[d] += originators_changes[d]
    if (prune):
        prune_overlapping(originators)
    originators_l = []
    for d, l in originators.items():
        if score_by == 0:
            score = len(l)
        elif score_by == 1:
            score = sum([len(t[2]) / len(t[1]) for t in l])
        else:
            raise Exception("Unrecognized metric %d" % score_by)
        originators_l.append((d, score, l))
    return sorted(list(originators_l), key=lambda x: -x[1])
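With score_by=1, each recorded (phrase, doms_so_far, dom_set, ys) tuple contributes len(dom_set)/len(doms_so_far), i.e. how far the phrase eventually spread relative to its early-adopter set; a toy check with invented tuples follows.

# Toy scoring check for score_by=1: one originator credited with two phrases.
l = [
    ("phrase one", {"a.com"}, {"a.com", "b.com", "c.com"}, "2018A"),  # 3/1 = 3.0
    ("phrase two", {"a.com", "b.com"}, {"a.com", "b.com"}, "2018B"),  # 2/2 = 1.0
]
score = sum(len(t[2]) / len(t[1]) for t in l)
print(score)  # 4.0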
Example #15
def get_total_policies_in_families():
    for ys in util.iter_yearseason():
        util.__init_domains_cache(ys)
        yield sum((len(s) for s in util.domainsets_cache[ys].values()))
Example #16
def main():
    global start, stop, topN, SENTENCES, WORDS, ENTITIES, ys_list

    if MEM_DEBUG:
        tracemalloc.start(10)

    parser = argparse.ArgumentParser(
        description='Runs basic analytics over pre-sorted n-grams')
    parser.add_argument(dest="N",
                        type=int,
                        help='Number of output phrases per metric')
    parser.add_argument(
        dest="grams",
        type=str,
        nargs="+",
        help="Gram types to include. Numerical or any of 'emnsuw'")
    parser.add_argument('-s',
                        dest="sentences",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine sentences')
    parser.add_argument('-w',
                        dest="words",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine words')
    parser.add_argument('-e',
                        dest="entities",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine entities')

    util.add_arguments(parser)

    args = parser.parse_args()

    #Arguments:
    #analytics.py <N> <gram types...> [-s] [-w] [-e]
    #Finds the top N phrases for each requested gram type
    #"-s" includes sentences, "-w" words, "-e" entities
    topN = args.N

    gram_list = []
    for gram in args.grams:
        try:
            #If it's an integer, add it
            gram_list.append(int(gram))
        except ValueError:  #Not an integer -- treat each character as a gram type
            for char in list(gram):
                if char not in "swenum":
                    raise Exception("Illegal gram: %s" % char)
                gram_list.append(char)

    util.process_arguments(args)
    util.CACHE_DB = False

    ys_list = list(util.iter_yearseason())

    do_analytics(gram_list)
Example #17
def load_top_grams(n, domain_norm_factor, policy_norm_factor, domain_counts):
    #Step 1 -- filter top phrases
    top_phrases = set()

    #if util.DO_SAMPLE:
    #    tp_cutoffs = {ys: dc * 0.01 for ys, dc in domain_counts.items()}
    #else:
    #    tp_cutoffs = {ys: dc * 0.005 for ys, dc in domain_counts.items()}
    if util.DO_SAMPLE:
        tp_cutoffs = {ys: 0.01 for ys, dc in domain_counts.items()}
    else:
        tp_cutoffs = {ys: 0.005 for ys, dc in domain_counts.items()}
    limit = min(tp_cutoffs.values())
    for yearseason, t in ioutils.load_grams_parallel(
            n,
            recount=True,
            limit=limit,
            domain_norm_factor=domain_norm_factor,
            policy_norm_factor=policy_norm_factor):
        if t[0]["unique"] >= tp_cutoffs[yearseason]:
            top_phrases.add(t[1])

    logging.info("\t%s-grams: Identified %d top phrases" %
                 (n, len(top_phrases)))
    mem_count()

    #Sanity check
    if len(top_phrases) == 0:
        logging.error("Found no top phrases for grams: %s\n" % str(n))
        return None

    #Step 2 -- find counts for top phrases
    gram_freq_by_countfxn = {
        count_name: {s: [0] * num_intervals
                     for s in top_phrases}
        for count_name in util.count_fxns
    }
    gram_freq_by_countfxn["raw"] = {
        s: [[] for _ in range(num_intervals)]
        for s in top_phrases
    }
    ys_idx_map = {
        ys: idx
        for idx, ys in enumerate(list(util.iter_yearseason()))
    }
    ct = 0
    for yearseason, (counts, s, *doms) in ioutils.load_grams_parallel(
            n,
            search_for=frozenset(top_phrases),
            recount=True,
            domain_norm_factor=domain_norm_factor,
            policy_norm_factor=policy_norm_factor):
        if ct % 100 == 0:
            print("\tLoaded %d phrase-intervals" % ct, end='\r')
        ct += 1

        ys_idx = ys_idx_map[yearseason]

        gram_freq_by_countfxn["raw"][s][ys_idx] = doms

        for count_name, count in counts.items():
            gram_freq_by_countfxn[count_name][s][ys_idx] = count

    print("\033[K", end="\r")
    logging.info("\tData loaded")
    mem_count()
    return gram_freq_by_countfxn
Example #18
def do_analytics(gram_list):
    global SLOPE_COUNT_THRESH, SLOPE_PSEUDOCOUNT

    webfn, weburl = util.get_web_fn("raw", "analytics.html")
    logging.info("Access at %s" % weburl)

    #logging.info("Pre-loading Alexa results")
    #alexa.load()

    logging.info("Starting Counting intervals")
    num_intervals = len(list(util.iter_yearseason()))
    logging.info("Starting counting policies")
    domain_counts, policy_counts = get_policy_counts()

    average_policy_count = sum(policy_counts.values()) / len(policy_counts)
    policy_norm_factor = {
        yearseason: 1 / policy_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }

    average_domain_count = sum(domain_counts.values()) / len(domain_counts)
    domain_norm_factor = {
        yearseason: 1 / domain_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }

    with open(webfn, "w+") as webf:
        webf.write("""
<html>
    <head>
        <link rel="stylesheet" type="text/css" href="/styles/style.css"/>
    </head>
<body>
""")
        webf.write("""

<div class="query">
<iframe id="queryframe" src="https://privacypolicies.cs.princeton.edu/search_policies.php?basedir=%s">
</iframe>
</div>

<div class="metricsd"><table class="metricst">
        """ % util.testing_prefix)

        #Iterate phrase type
        for n in gram_list:
            ngram_name = "%s-grams" % n if n not in friendly_names else friendly_names[
                n]
            webf.write("""
<tr class="nhtr">
<td><h3>Data for %s</h3></td>
</tr>
<tr class="ntr">""" % ngram_name)
            logging.info("Loading top grams for %s-grams" % (n))
            gram_freq_by_countfxn = load_top_grams(n, domain_norm_factor,
                                                   policy_norm_factor,
                                                   domain_counts)
            logging.info("Done loading top grams for %s-grams" % (n))
            mem_trace()
            if gram_freq_by_countfxn is None:
                logging.error("No grams")
                continue
            gram_freq_raw = gram_freq_by_countfxn["raw"]
            #Iterate counting methods i.e. unique, total, or alexa-weighted domains
            for count_name, (countf,
                             count_friendly_name) in util.count_fxns.items():
                logging.info("\tBeginning scoring with %s" % (count_name))
                #Score based on various metrics
                gram_freq = gram_freq_by_countfxn[count_name]

                base_count = None
                if count_name == "total":
                    base_count = 1 / average_domain_count
                elif count_name == "unique":
                    base_count = 1 / average_policy_count
                elif count_name == "alexa":
                    base_count = alexa.average_traffic
                slope_count_thresh = get_slope_thresh(gram_freq, base_count)
                slope_pseudocount = base_count
                logging.info("\tSlope thresh for round is: %0.4E" %
                             slope_count_thresh)
                logging.info("\tSlope pseudocount for round is: %0.4E" %
                             slope_pseudocount)
                gram_scores = {mname: [] for mname in metrics}

                #Identifying phrases of interest based on metrics & rules
                for s in gram_freq:

                    vals = gram_freq[s]
                    for (mname, (score_fxn, hname)) in metrics.items():
                        heap = gram_scores[mname]
                        score = score_fxn(vals, slope_pseudocount,
                                          slope_count_thresh)
                        if score == -100000: continue
                        if len(heap) >= topN:
                            heapq.heappushpop(heap, (score, s))
                        else:
                            heapq.heappush(heap, (score, s))

                logging.info("\tDone scoring")
                webf.write('<td class="dataCb">\n')
                webf.write("<h4>Counted by %s</h4>\n" % (count_friendly_name))

                score_name_str = "Top %s by %%s counted by %s</h4>\n" % (
                    ngram_name, count_friendly_name)

                #Dump data
                for mname, (fsc, hname) in metrics.items():
                    heap = gram_scores[mname]
                    logging.info("\tSorting top values for %s" % (mname))

                    #Heaps aren't sorted, we need to sort the heap
                    #Taking advantage of the heap structure doesn't help us here... pop is log(n), and we need n iterations
                    heap = sorted(heap, reverse=True)
                    phrases = [s for sc, s in heap]

                    groups = [
                        phrases[10 * i:min(10 * (i + 1), len(phrases))]
                        for i in range(math.ceil(len(phrases) / 10))
                    ]
                    scores = {s: sc for sc, s in heap}

                    logging.info("\tStarting data dump for %s" % (mname))
                    nwebfn, nurl = create_data_dump(
                        n, ["metrics", mname],
                        "%s-grams_top_%s_%s%s" %
                        (n, mname, count_name, util.get_file_suffix()),
                        groups,
                        gram_freq,
                        gram_freq_raw,
                        score_name_str % hname,
                        scores=scores)
                    webf.write(
                        '<span style="margin-left:2em"><a href="%s" target="_blank">%s</a></span></br>\n'
                        % (nurl, hname))
                    print("\033[K\t\tGraphs created", end="\r")
                    webf.flush()

                webf.write('<br/></td>\n')

                #Best to force a flush for partial readouts
                webf.flush()
                os.fsync(webf)
                mem_trace()
                print("\033[K", end="\r")
                logging.info("\tDone dumping data")
                mem_count()

            webf.write('</tr>')
            util.close_pool()

        webf.write("""
</table>
</div>
</body>
        </html>""")
        logging.info("Done")
Example #19
        n = len(phrases[0].split(' '))
    counts = {phrase:{ys:0 for ys in util.iter_yearseason()} for phrase in phrases}
    for ys in util.iter_yearseason():
        unmet_phrases = set(phrases)
        count_adjust = norm_params[ys]
        for row in util.load_grams(n, ys):
            rphrase = row[1]
            for phrase in unmet_phrases:
                if rphrase == phrase:
                    unmet_phrases.remove(phrase)
                    count = row[0]
                    counts[phrase][ys] = int(count) / count_adjust
                    break #Only one phrase can match
            if len(unmet_phrases) == 0:
                break #No more phrases to be found this interval
    return counts

if __name__ == "__main__":
    #TODO argument parsing
    phrase = sys.argv[1]
    tags = sys.argv[2]
    norm_params = dict(zip([ys for ys in util.iter_yearseason()],[float(i) for i in sys.argv[3].split(' ')]))
    if len(sys.argv) == 5:
        n_override = sys.argv[4]
    else:
        n_override = None
    res = scan(phrase,norm_params,n_override=n_override)
    for phrase in res:
        print("%s,%s" % (phrase, ",".join([str(res[phrase][ys]) for ys in util.iter_yearseason()])))
            
Example #20
            write_obj.append({"id": i, "domains": l})
            i += 1
        json.dump(write_obj, f)

def make_dirs(intervals):
    for ys in intervals:
        Path("../data/text_sim/%s/" % ys).mkdir(parents=True, exist_ok=True)

    

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find and flag duplicate documents')

    parser.add_argument('intervals', type=str, nargs='+',
                                            help='Which intervals to process over. "all" scans all intervals in sequence')
    parser.add_argument('--sample', dest="sample_fdd", action='store_const', const=True, default=False, help='Sample')

    util.add_arguments(parser)
    args = parser.parse_args()
    intervals = args.intervals
    SAMPLE = args.sample_fdd
    util.process_arguments(args)
    if intervals[0] == "all":
        intervals = list(util.iter_yearseason())

    make_dirs(intervals)
        
    for interval in intervals:
        make_textsim_graph(interval)
    util.close_pool()
Example #21
import plotly
import math

#WEB_PREFIX = "https://cs.princeton.edu/~rbamos"
#WEB_DIR = os.path.expanduser("~/public_html/")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("""Usage:
python3 -m historical.draw_freq_dist <n>""")
        exit(-1)
    n = sys.argv[1]

    np.set_printoptions(threshold=sys.maxsize)

    yss = [ys for ys in util.iter_yearseason()]

    fig = go.Figure()

    b = 2
    max_count = 0
    for ys in yss:
        freqs = [r[0] for r in util.load_grams(n, ys)]
        max_count = max(max_count, len(freqs))
    log_index = [
        int(math.pow(b, i)) for i in range(int(math.log(max_count, b)))
    ]
    for ys in yss:
        freqs = [r[0] for r in util.load_grams(n, ys)]
        freqs2 = [freqs[i] if i < len(freqs) else 0 for i in log_index]
        fig.add_trace(go.Scatter(x=log_index, y=freqs2, mode='lines', name=ys))
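A quick check of the log-spaced rank index built above, with a made-up maximum count; only the index construction is reproduced here.

# With base b = 2 and max_count = 1000, int(math.log(1000, 2)) = 9, so ranks
# 2^0 .. 2^8 (1, 2, 4, ..., 256) are sampled from the frequency list.
import math

b = 2
max_count = 1000
log_index = [int(math.pow(b, i)) for i in range(int(math.log(max_count, b)))]
print(log_index)  # [1, 2, 4, 8, 16, 32, 64, 128, 256]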