def scan(phrase, norm_params, n_override=None):
    phrases = None
    if type(phrase) is list or type(phrase) is tuple:
        phrases = phrase
    else:
        phrases = [phrase]
    if n_override is not None:
        n = n_override
    else:
        n = len(phrases[0].split(' '))
    counts = {phrase: {ys: 0 for ys in util.iter_yearseason()} for phrase in phrases}
    for ys in util.iter_yearseason():
        unmet_phrases = set(phrases)
        count_adjust = norm_params[ys]
        for row in util.load_grams(n, ys):
            rphrase = row[1]
            for phrase in unmet_phrases:
                if rphrase == phrase:
                    unmet_phrases.remove(phrase)
                    count = row[0]
                    counts[phrase][ys] = int(count) / count_adjust
                    break  #Only one phrase can match
            if len(unmet_phrases) == 0:
                break  #No more phrases to be found this interval
    return counts
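# Hedged usage sketch (not part of the original module): one way scan() might be
# invoked, assuming util.iter_yearseason() yields interval labels and norm_params
# maps each interval to a positive divisor. The phrases and the divisor value
# below are invented for illustration only.
def _example_scan_usage():
    norm_params = {ys: 1000.0 for ys in util.iter_yearseason()}  #assumed divisors
    counts = scan(["personal information", "third parties"], norm_params)
    for p, by_interval in counts.items():
        print(p, [by_interval[ys] for ys in util.iter_yearseason()])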
def draw_html_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)
    ar = list(reversed(ar))
    l = list(reversed(l))
    fn, url = util.get_web_fn("graphs", "html_heatmaps", slugify(phrase))
    yss = [ys for ys in util.iter_yearseason()]
    if len(ar) == 0:
        sys.stderr.write("%s,%s has no values\n" % (n, phrase))
        return url
    with open(fn, "w+") as f:
        f.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
tr:nth-child(even) {background-color: #d2d2d2;}
</style>
</head>
<body>
""")
        f.write('<h3>Phrase: %s</h3>' % (phrase, ))
        f.write('<h4><a href="%s" target="_blank">Graphical Heatmap</a></h4>\n' %
                (get_heatmap_url(n, phrase), ))
        f.write('<table>\n')
        headers = ["Domain"] + list(util.iter_yearseason())
        f.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
        for domId in range(len(ar)):
            f.write('<tr>\n')
            f.write('<td>%s</td>\n' % l[domId])
            for ysId in range(len(ar[0])):
                if ar[domId][ysId] == 1:
                    #Policy URL
                    policy_url = "https://privacypolicies.cs.princeton.edu/fetch_policy.php?domain=%s&interval=%s_%s" % (
                        l[domId], yss[ysId][:4], yss[ysId][4])
                    f.write('<td><a href="%s">%s</a></td>\n' %
                            (policy_url, yss[ysId]))
                else:
                    f.write('<td></td>\n')
        f.write('</table>\n')
        f.write("</body></html>")
    np.set_printoptions(**opts)  #Restore the original print options
    return url
def scan_for_phrase_doms(phrase, n):
    phrases = None
    if type(phrase) is list or type(phrase) is tuple:
        phrases = phrase
    else:
        phrases = [phrase]
    dom_hist = {
        phrase: {ys: [] for ys in util.iter_yearseason()}
        for phrase in phrases
    }
    for ys in util.iter_yearseason():
        unmet_phrases = set(phrases)
        for row in util.load_grams(n, ys):
            rphrase = row[1]
            if rphrase in unmet_phrases:
                unmet_phrases.remove(rphrase)
                doms = row[2:]
                dom_hist[rphrase][ys] = doms
            if len(unmet_phrases) == 0:
                break  #No more phrases to be found this interval
    return dom_hist
def load_grams_parallel(n,
                        limit=None,
                        search_for=None,
                        recount=False,
                        domain_norm_factor=None,
                        policy_norm_factor=None):
    util.get_blacklist()  #Ensure this is loaded first

    def queue_grams(q, n, yearseas, limit, search_for):
        for row in load_grams(n, yearseas, limit=limit):
            if search_for is not None:
                s = row[1]
                if s not in search_for:
                    continue
            if recount:
                counts = {}
                doms = row[2:]
                for count_name, (countf,
                                 count_friendly_name) in util.count_fxns.items():
                    counts[count_name] = countf(
                        doms,
                        yearseason=yearseas,
                        domain_norm_factor=domain_norm_factor,
                        policy_norm_factor=policy_norm_factor)
                row = (counts, *row[1:])
            q.put((yearseas, row))

    q = mp.Queue()
    procs = []
    queuefxn = queue_grams
    for yearseas in util.iter_yearseason():
        p = mp.Process(target=queuefxn,
                       args=(q, n, yearseas, limit, search_for))
        p.start()
        procs.append(p)
    while any((p.is_alive() for p in procs)):
        try:
            yield q.get(True, 10)
        except queue.Empty:
            pass
            #if any((p.is_alive() for p in procs)):
            #    sys.stderr.write("Unexpected empty queue. Trying again.\n")
    q.close()
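# Hedged usage sketch (not in the original source): load_grams_parallel() is a
# generator yielding (yearseason, row) tuples. Without recount, rows appear to
# keep their on-disk shape of (count, phrase, *domains), as inferred from the
# other callers in this codebase. The gram size and the search phrase below are
# assumptions made purely for illustration.
def _example_consume_parallel_grams():
    wanted = frozenset({"personal information"})  #hypothetical search set
    for yearseason, row in load_grams_parallel(2, search_for=wanted):
        count, phrase = row[0], row[1]
        print(yearseason, phrase, count)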
def draw_heatmap_from_data(n, phrase, ar, l):
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    ar, l = sort_row_data(ar, l)
    fn, url = util.get_web_fn("graphs", "heatmaps", slugify(phrase))
    yss = [ys for ys in util.iter_yearseason()]
    fig = go.Figure(data=go.Heatmap(z=ar,
                                    y=l,
                                    x=yss,
                                    colorscale=[(0, "#d0d3d4"), (1, "#1a5276")],
                                    showscale=False))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(**opts)  #Restore the original print options
    return url
def find_originators_for_phrase(phrase, dom_hist_phrase, thresh):
    originators = {}
    dom_set = functools.reduce(lambda x, y: x | y,
                               (set(d) for d in dom_hist_phrase.values()))
    thresh_ct = int(math.ceil(thresh * len(dom_set)))
    if thresh_ct == 0:
        thresh_ct = 1
    #print("%s: %d" % (phrase, thresh_ct), end='')
    doms_so_far = set()
    for ys in util.iter_yearseason():
        doms_so_far |= set(dom_hist_phrase[ys])
        if len(doms_so_far) >= thresh_ct:
            #print(" %s" % len(doms_so_far))
            break
    for d in doms_so_far:
        if d not in originators:
            originators[d] = []
        originators[d].append((phrase, doms_so_far, dom_set, ys))
    return originators
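# Hedged worked example (not in the original source): a tiny history built from
# made-up domains, assuming at least two intervals are configured. With four
# distinct domains overall and thresh=0.5, thresh_ct = ceil(0.5 * 4) = 2, so the
# scan stops at the first interval and a.com/b.com are recorded as originators.
def _example_originator_threshold():
    intervals = list(util.iter_yearseason())
    dom_hist_phrase = {ys: [] for ys in intervals}  #hypothetical history
    dom_hist_phrase[intervals[0]] = ["a.com", "b.com"]
    dom_hist_phrase[intervals[1]] = ["b.com", "c.com", "d.com"]
    print(find_originators_for_phrase("example phrase", dom_hist_phrase, 0.5))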
def load_top_grams(n, domain_norm_factor, policy_norm_factor, domain_counts):
    #Step 1 -- filter top phrases
    top_phrases = set()
    tp_cutoffs = {ys: dc * 0.005 for ys, dc in domain_counts.items()}
    limit = min(tp_cutoffs.values())
    for yearseason, t in util.load_grams_parallel(n, limit=limit):
        if t[0] >= tp_cutoffs[yearseason]:
            top_phrases.add(t[1])
    print("\033[K%s-grams: Identified %d top phrases" % (n, len(top_phrases)))
    #Sanity check
    if len(top_phrases) == 0:
        sys.stderr.write("Found no top phrases for grams: %s\n" % str(n))
        return None
    #Step 2 -- find counts for top phrases
    gram_freq_by_countfxn = {
        count_name: {s: [0] * num_intervals for s in top_phrases}
        for count_name in util.count_fxns
    }
    gram_freq_by_countfxn["raw"] = {
        s: [[] for _ in range(num_intervals)]
        for s in top_phrases
    }
    ys_idx_map = {ys: idx for idx, ys in enumerate(list(util.iter_yearseason()))}
    ct = 0
    for yearseason, (counts, s, *doms) in util.load_grams_parallel(
            n,
            search_for=frozenset(top_phrases),
            recount=True,
            domain_norm_factor=domain_norm_factor,
            policy_norm_factor=policy_norm_factor):
        #s = t[1]
        if ct % 100 == 0:
            print("\033[KLoaded %d phrase-intervals" % ct, end='\r')
        ct += 1
        #doms = t[2:]
        ys_idx = ys_idx_map[yearseason]
        gram_freq_by_countfxn["raw"][s][ys_idx] = doms
        for count_name, count in counts.items():
            gram_freq_by_countfxn[count_name][s][ys_idx] = count
        #Iterate counting methods i.e. unique, total, or alexa-weighted domains
        #for count_name, (countf, count_friendly_name) in util.count_fxns.items():  #FIXME
        #    gram_freq_by_countfxn[count_name][s][ys_idx] = countf(doms, yearseason=yearseason, domain_norm_factor=domain_norm_factor, policy_norm_factor=policy_norm_factor)
    print("\033[KData loaded", end="\r")
    return gram_freq_by_countfxn
def draw_lines(filename, topX, freq_data=None):
    """
    Frequency data is an array of tuples. The first item is the phrase,
    the second item is the number of occurrences at each interval
    """
    opts = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    fn, url = util.get_web_fn('graphs', "lines",
                              '%s.html' % os.path.basename(filename))
    if os.path.exists(fn):
        # print("Skipping line drawing")
        return url
    yss = [ys for ys in util.iter_yearseason()]
    if freq_data is None:
        freq_data = get_frequency_data(filename, maxCount=topX)
    freq_data = list(freq_data)
    fig = go.Figure()
    # layout=go.Layout(
    #     legend=dict(x=-.1, y=-0.5*(len(freq_data) // 10))
    # )
    for label, data in freq_data:
        #wrappedlabel = textwrap.fill(label, 40)
        if len(label) >= 40:
            wrappedlabel = label[:37] + "..."
        else:
            wrappedlabel = label
        fig.add_trace(
            go.Scatter(x=yss, y=data, mode='lines', name=wrappedlabel))
    plotly.offline.plot(fig, filename=fn, auto_open=False)
    np.set_printoptions(**opts)  #Restore the original print options
    return url
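# Hedged example (not in the original source) of the freq_data structure the
# docstring describes: one (phrase, per-interval values) tuple per trace. The
# phrases, values, and the output filename are invented for illustration.
def _example_draw_lines_usage():
    yss = list(util.iter_yearseason())
    freq_data = [
        ("personal information", [0.10] * len(yss)),
        ("third parties", [0.05] * len(yss)),
    ]
    url = draw_lines("example.csv", -1, freq_data=freq_data)
    print(url)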
print("\033[K", end="\r") logging.info("\tDone dumping data") mem_count() webf.write('</tr>') util.close_pool() webf.write(""" </table> </div> </body> </html>""") logging.info("Done") num_intervals = len(list(util.iter_yearseason())) def main(): global start, stop, topN, SENTENCES, WORDS, ENTITIES, ys_list if MEM_DEBUG: tracemalloc.start(10) parser = argparse.ArgumentParser( description='Runs basic analytics over pre-sorted n-grams') parser.add_argument(dest="N", type=int, help='Number of output phrases per metric') parser.add_argument( dest="grams",
def create_data_dump(n,
                     folders,
                     basename,
                     groups,
                     gram_freq,
                     gram_freq_raw,
                     score_name_str,
                     scores=None,
                     gen_figs=False):
    """
    Take the given phrases and create:
    - A table of phrase, score (if available), occurrences
    - Line graphs for phrase occurrence
    - Heatmaps for phrases
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    group_to_graph = {}
    hit_to_graph = {}
    #Part one -- create CSV data files, heatmaps, and line graphs
    for group_num in range(len(groups)):
        group_hits = groups[group_num]
        nfn = os.path.join(util.OUT_DIR, "%s_%d.csv" % (basename, group_num))
        if gen_figs:
            #Line graph
            line_url = draw_lines.draw_lines(
                nfn, -1, ((hit, gram_freq[hit]) for hit in group_hits))
            logging.info("\t\tDone drawing lines for group %d" % (group_num))
            #Heatmap
            if n == 'w':
                html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                    n, group_hits)
            else:
                heatmap_urls, html_heatmap_urls = draw_heatmap.draw_html_and_reg_heatmaps(
                    n, group_hits, gram_freq_raw)
            logging.info("\033[K\t\tDone drawing heatmaps for group %d" %
                         (group_num))
        else:
            #Skip drawing
            line_url = draw_lines.get_lines_url(nfn, -1)
            heatmap_urls = draw_heatmap.get_multiple_heatmap_urls(n, group_hits)
            html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                n, group_hits)
        group_to_graph[group_num] = line_url
        with open(nfn, "w+") as f:
            #Write CSV data file for later use
            writer = csv.writer(f)
            for hit in group_hits:
                vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                writer.writerow([
                    hit.replace("\\", "\\\\").replace("\n", "\\n"), *vals_str
                ])
                #heatmap_url = heatmap_urls[hit]
                heatmap_url = html_heatmap_urls[hit]
                # print(heatmap_url)
                hit_to_graph[hit] = heatmap_url
        logging.info("\t\tDone drawing group %d" % (group_num))
    #Part two -- create usable web page to explore
    with open(nwebfn, "w+") as nwebf:
        nwebf.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
</style>
</head>
<body>
""")
        pool = util.get_pool()
        all_hits = list(
            itertools.chain(*[[hit for hit in group] for group in groups]))
        plots = pool.starmap(miniplot.get_plot_as_img,
                             [(ys_list, gram_freq[hit]) for hit in all_hits])
        plots = list(plots)
        logging.info("\t\tDone drawing plots")
        #Break up by group
        pNum = 0
        for group, gid in zip(groups, range(len(groups))):
            if len(group) == 0:
                continue
            line_graph_url = group_to_graph[gid]
            nwebf.write('<h3>%s</h3>' % (score_name_str))
            nwebf.write(
                '<h4><a href="%s" target="_blank">Line graph for group %d</a></h4>\n'
                % (line_graph_url, gid))
            nwebf.write('<table>\n')
            headers = ["Score", "Phrase", "Bar Plot"] + list(
                util.iter_yearseason())
            if scores is None:
                headers = headers[1:]
            nwebf.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
            #Then phrase
            for hit in group:
                nwebf.write('<tr>\n')
                if scores is not None:
                    if abs(scores[hit]) < 0.01 and scores[hit] != 0:
                        nwebf.write('<td>%0.2E</td>' % scores[hit])
                    else:
                        nwebf.write('<td>%0.2f</td>' % scores[hit])
                nwebf.write('<td width="30%">\n')
                #vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                vals_str = [
                    ("%0.2E" if (abs(gram_freq[hit][ysid]) != 0
                                 and abs(gram_freq[hit][ysid]) < 0.01) else
                     "%0.2f") % (gram_freq[hit][ysid])
                    for ysid in range(num_intervals)
                ]
                heatmap_graph_url = hit_to_graph[hit]
                hit_link = '<a href="%s" target="_blank">%s</a>' % (
                    heatmap_graph_url,
                    hit.replace("\\", "\\\\").replace("\n", "\\n"))
                plot = plots[pNum]  # miniplot.get_plot_as_img(ys_list, gram_freq[hit])
                pNum += 1
                nwebf.write('</td><td>'.join([hit_link, plot, *vals_str]))
                #TODO add miniplot
                nwebf.write('</td></tr>\n')
            logging.info("\t\tDone with table for group %d" % (gid))
            nwebf.write('</table>\n')
        nwebf.write("</body></html>")
    mem_count()
    return nwebfn, nurl
def get_num_policy_families():
    for ys in util.iter_yearseason():
        util.__init_domains_cache(ys)
        yield len(util.domainsets_cache[ys])
def get_largest_families():
    for ys in util.iter_yearseason():
        util.__init_domains_cache(ys)
        yield max((len(s) for s in util.domainsets_cache[ys].values()))
def get_plot_as_img(x_vals, y_vals):
    #x_vals are the interval names (e.g. "2019A") used as x-axis labels;
    #y_vals are the corresponding metric values
    buf = BytesIO()
    plt.figure(figsize=(4, 1.5))
    x_labels = x_vals
    x_vals = np.arange(float(len(x_labels)))
    fig = sns.lineplot(x=x_vals, y=y_vals, color="#666666")
    plt.xticks(x_vals)
    fig.set_xlim((0, len(x_vals) - 1))
    #fig.set_yticklabels(fig.get_yticklabels(), fontsize='x-small')
    fig.set_xticklabels(x_labels,
                        rotation=90,
                        horizontalalignment='center',
                        fontsize='x-small')
    plt.savefig(buf, format="png", bbox_inches='tight')
    buf.seek(0)
    plt.close()
    data = base64.b64encode(buf.getbuffer()).decode("ascii")
    return f"<img src='data:image/png;base64,{data}'/>"


if __name__ == "__main__":
    import historical.util as util
    data = [
        0.00, 0.00, 0.00, 0.00, 0.03, 0.02, 0.06, 0.07, 0.07, 0.10, 0.13, 0.12,
        0.10, 0.15, 0.12, 0.13, 0.06, 0.14, 0.10, 0.10, 0.07, 0.14
    ]
    ys = list(util.iter_yearseason())
    print(get_plot_as_img(ys, data))
def find_originators(n,
                     metric,
                     thresh,
                     score_by=1,
                     randomize=False,
                     prune=False):
    global dom_hist_cache
    if PARALLEL:
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() // 4)
    if metric == "top":
        top_phrases = set()
        for yearseason in util.iter_yearseason():
            m = -1
            for t in util.load_grams(n, yearseason):
                num = int(t[0])
                if num < 10:
                    break
                top_phrases.add(t[1])
        phrases = list(top_phrases)
        del top_phrases
    else:
        phrases = list(zip(*util.load_top_phrases(n, t=metric)))[0]
    if randomize and dom_hist_cache is not None:
        #We don't need to refetch the history for multiple runs
        dom_hist = dom_hist_cache
    else:
        if False:
            scan_for_phrase_doms_n_fixed = functools.partial(
                scan_for_phrase_doms, n=n)
            BATCH_SIZE = 10
            batches = (phrases[i:i + BATCH_SIZE]
                       for i in range(0, len(phrases), BATCH_SIZE))
            dom_hist = {}
            dh_batches = pool.map(scan_for_phrase_doms_n_fixed, batches)
            for dh in dh_batches:
                dom_hist.update(dh)
        else:
            dom_hist = scan_for_phrase_doms(phrases, n)
    if randomize:
        dom_hist_cache = dom_hist  #Don't save history unless we need to
        dom_hist = randomize_dom_hist(dom_hist)
    originators = {}
    for phrase in dom_hist:
        if PARALLEL:
            break
        originators_changes = find_originators_for_phrase(
            phrase, dom_hist[phrase], thresh)
        for d in originators_changes:
            if d not in originators:
                originators[d] = originators_changes[d]
            else:
                originators[d] += originators_changes[d]
    if PARALLEL:
        for originators_changes in pool.map(
                _find_originators_for_phrase,
                ((phrase, dom_hist[phrase], thresh) for phrase in dom_hist)):
            for d in originators_changes:
                if d not in originators:
                    originators[d] = originators_changes[d]
                else:
                    originators[d] += originators_changes[d]
    if (prune):
        prune_overlapping(originators)
    originators_l = []
    for d, l in originators.items():
        if score_by == 0:
            score = len(l)
        elif score_by == 1:
            score = sum([len(t[2]) / len(t[1]) for t in l])
        else:
            raise Exception("Unrecognized metric %d" % score_by)
        originators_l.append((d, score, l))
    return sorted(list(originators_l), key=lambda x: -x[1])
def get_total_policies_in_families():
    for ys in util.iter_yearseason():
        util.__init_domains_cache(ys)
        yield sum((len(s) for s in util.domainsets_cache[ys].values()))
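# Hedged usage sketch (not in the original source), assuming the three family
# generators above live in the same module: each yields one value per interval,
# so they can be zipped with the interval labels for a small summary printout.
def _example_family_summary():
    for ys, fams, largest, total in zip(util.iter_yearseason(),
                                        get_num_policy_families(),
                                        get_largest_families(),
                                        get_total_policies_in_families()):
        print("%s: %d families, largest %d, %d policies total" %
              (ys, fams, largest, total))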
def main():
    global start, stop, topN, SENTENCES, WORDS, ENTITIES, ys_list
    if MEM_DEBUG:
        tracemalloc.start(10)
    parser = argparse.ArgumentParser(
        description='Runs basic analytics over pre-sorted n-grams')
    parser.add_argument(dest="N",
                        type=int,
                        help='Number of output phrases per metric')
    parser.add_argument(
        dest="grams",
        type=str,
        nargs="+",
        help="Gram types to include. Numerical or any of 'emnsuw'")
    parser.add_argument('-s',
                        dest="sentences",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine sentences')
    parser.add_argument('-w',
                        dest="words",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine words')
    parser.add_argument('-e',
                        dest="entities",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine entities')
    util.add_arguments(parser)
    args = parser.parse_args()
    #Arguments:
    #analytics.py <MIN> <MAX> <N> (sw)
    #Finds the top N n-grams for each n in [MIN .. MAX]
    #"s" in the last argument indicates including sentences, "w" words. Blank for nothing
    topN = args.N
    gram_list = []
    for gram in args.grams:
        try:
            #If it's an integer, add it
            gram_list.append(int(gram))
        except ValueError:
            for char in list(gram):
                if char not in "swenum":
                    raise Exception("Illegal gram: %s" % char)
                gram_list.append(char)
    util.process_arguments(args)
    util.CACHE_DB = False
    ys_list = list(util.iter_yearseason())
    do_analytics(gram_list)
def load_top_grams(n, domain_norm_factor, policy_norm_factor, domain_counts):
    #Step 1 -- filter top phrases
    top_phrases = set()
    #if util.DO_SAMPLE:
    #    tp_cutoffs = {ys: dc * 0.01 for ys, dc in domain_counts.items()}
    #else:
    #    tp_cutoffs = {ys: dc * 0.005 for ys, dc in domain_counts.items()}
    if util.DO_SAMPLE:
        tp_cutoffs = {ys: 0.01 for ys, dc in domain_counts.items()}
    else:
        tp_cutoffs = {ys: 0.005 for ys, dc in domain_counts.items()}
    limit = min(tp_cutoffs.values())
    for yearseason, t in ioutils.load_grams_parallel(
            n,
            recount=True,
            limit=limit,
            domain_norm_factor=domain_norm_factor,
            policy_norm_factor=policy_norm_factor):
        if t[0]["unique"] >= tp_cutoffs[yearseason]:
            top_phrases.add(t[1])
    logging.info("\t%s-grams: Identified %d top phrases" %
                 (n, len(top_phrases)))
    mem_count()
    #Sanity check
    if len(top_phrases) == 0:
        logging.error("Found no top phrases for grams: %s\n" % str(n))
        return None
    #Step 2 -- find counts for top phrases
    gram_freq_by_countfxn = {
        count_name: {s: [0] * num_intervals for s in top_phrases}
        for count_name in util.count_fxns
    }
    gram_freq_by_countfxn["raw"] = {
        s: [[] for _ in range(num_intervals)]
        for s in top_phrases
    }
    ys_idx_map = {
        ys: idx
        for idx, ys in enumerate(list(util.iter_yearseason()))
    }
    ct = 0
    for yearseason, (counts, s, *doms) in ioutils.load_grams_parallel(
            n,
            search_for=frozenset(top_phrases),
            recount=True,
            domain_norm_factor=domain_norm_factor,
            policy_norm_factor=policy_norm_factor):
        if ct % 100 == 0:
            print("\tLoaded %d phrase-intervals" % ct, end='\r')
        ct += 1
        ys_idx = ys_idx_map[yearseason]
        gram_freq_by_countfxn["raw"][s][ys_idx] = doms
        for count_name, count in counts.items():
            gram_freq_by_countfxn[count_name][s][ys_idx] = count
    print("\033[K", end="\r")
    logging.info("\tData loaded")
    mem_count()
    return gram_freq_by_countfxn
def do_analytics(gram_list):
    global SLOPE_COUNT_THRESH, SLOPE_PSEUDOCOUNT
    webfn, weburl = util.get_web_fn("raw", "analytics.html")
    logging.info("Access at %s" % weburl)
    #logging.info("Pre-loading Alexa results")
    #alexa.load()
    logging.info("Starting Counting intervals")
    num_intervals = len(list(util.iter_yearseason()))
    logging.info("Starting counting policies")
    domain_counts, policy_counts = get_policy_counts()
    average_policy_count = sum(policy_counts.values()) / len(policy_counts)
    policy_norm_factor = {
        yearseason: 1 / policy_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }
    average_domain_count = sum(domain_counts.values()) / len(domain_counts)
    domain_norm_factor = {
        yearseason: 1 / domain_counts[yearseason]
        for yearseason in util.iter_yearseason()
    }
    with open(webfn, "w+") as webf:
        webf.write("""
<html>
<head>
<link rel="stylesheet" type="text/css" href="/styles/style.css"/>
</head>
<body>
""")
        webf.write("""
<div class="query">
<iframe id="queryframe" src="https://privacypolicies.cs.princeton.edu/search_policies.php?basedir=%s">
</iframe>
</div>
<div class="metricsd"><table class="metricst">
""" % util.testing_prefix)
        #Iterate phrase type
        for n in gram_list:
            ngram_name = "%s-grams" % n if n not in friendly_names else friendly_names[n]
            webf.write("""
<tr class="nhtr">
<td><h3>Data for %s</h3></td>
</tr>
<tr class="ntr">""" % ngram_name)
            logging.info("Loading top grams for %s-grams" % (n))
            gram_freq_by_countfxn = load_top_grams(n, domain_norm_factor,
                                                   policy_norm_factor,
                                                   domain_counts)
            logging.info("Done loading top grams for %s-grams" % (n))
            mem_trace()
            if gram_freq_by_countfxn is None:
                logging.error("No grams")
                continue
            gram_freq_raw = gram_freq_by_countfxn["raw"]
            #Iterate counting methods i.e. unique, total, or alexa-weighted domains
            for count_name, (countf,
                             count_friendly_name) in util.count_fxns.items():
                logging.info("\tBeginning scoring with %s" % (count_name))
                #Score based on various metrics
                gram_freq = gram_freq_by_countfxn[count_name]
                base_count = None
                if count_name == "total":
                    base_count = 1 / average_domain_count
                elif count_name == "unique":
                    base_count = 1 / average_policy_count
                elif count_name == "alexa":
                    base_count = alexa.average_traffic
                slope_count_thresh = get_slope_thresh(gram_freq, base_count)
                slope_pseudocount = base_count
                logging.info("\tSlope thresh for round is: %0.4E" %
                             slope_count_thresh)
                logging.info("\tSlope pseudocount for round is: %0.4E" %
                             slope_pseudocount)
                gram_scores = {mname: [] for mname in metrics}
                #Identifying phrases of interest based on metrics & rules
                for s in gram_freq:
                    vals = gram_freq[s]
                    for (mname, (score_fxn, hname)) in metrics.items():
                        heap = gram_scores[mname]
                        score = score_fxn(vals, slope_pseudocount,
                                          slope_count_thresh)
                        if score == -100000:
                            continue
                        if len(heap) >= topN:
                            heapq.heappushpop(heap, (score, s))
                        else:
                            heapq.heappush(heap, (score, s))
                logging.info("\tDone scoring")
                webf.write('<td class="dataCb">\n')
                webf.write("<h4>Counted by %s</h4>\n" % (count_friendly_name))
                score_name_str = "Top %s by %%s counted by %s</h4>\n" % (
                    ngram_name, count_friendly_name)
                #Dump data
                for mname, (fsc, hname) in metrics.items():
                    heap = gram_scores[mname]
                    logging.info("\tSorting top values for %s" % (mname))
                    #Heaps aren't sorted, we need to sort the heap
                    #Taking advantage of the heap structure doesn't help us here...
                    #pop is log(n), and we need n iterations
                    heap = sorted(heap, reverse=True)
                    phrases = [s for sc, s in heap]
                    groups = [
                        phrases[10 * i:min(10 * (i + 1), len(phrases))]
                        for i in range(math.ceil(len(phrases) / 10))
                    ]
                    scores = {s: sc for sc, s in heap}
                    logging.info("\tStarting data dump for %s" % (mname))
                    nwebfn, nurl = create_data_dump(
                        n, ["metrics", mname],
                        "%s-grams_top_%s_%s%s" %
                        (n, mname, count_name, util.get_file_suffix()),
                        groups,
                        gram_freq,
                        gram_freq_raw,
                        score_name_str % hname,
                        scores=scores)
                    webf.write(
                        '<span style="margin-left:2em"><a href="%s" target="_blank">%s</a></span></br>\n'
                        % (nurl, hname))
                    print("\033[K\t\tGraphs created", end="\r")
                    webf.flush()
                webf.write('<br/></td>\n')
                #Best to force a flush for partial readouts
                webf.flush()
                os.fsync(webf)
                mem_trace()
                print("\033[K", end="\r")
                logging.info("\tDone dumping data")
                mem_count()
            webf.write('</tr>')
        util.close_pool()
        webf.write("""
</table>
</div>
</body>
</html>""")
    logging.info("Done")
if __name__ == "__main__":
    #TODO argument parsing
    phrase = sys.argv[1]
    tags = sys.argv[2]
    norm_params = dict(
        zip([ys for ys in util.iter_yearseason()],
            [float(i) for i in sys.argv[3].split(' ')]))
    if len(sys.argv) == 5:
        n_override = int(sys.argv[4])
    else:
        n_override = None
    res = scan(phrase, norm_params, n_override=n_override)
    for phrase in res:
        print("%s,%s" % (phrase, ",".join(
            [str(res[phrase][ys]) for ys in util.iter_yearseason()])))
write_obj.append({"id": i, "domains": l}) i += 1 json.dump(write_obj, f) def make_dirs(intervals): for ys in intervals: Path("../data/text_sim/%s/" % ys).mkdir(parents=True, exist_ok=True) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Find and flag duplicate documents') parser.add_argument('intervals', type=str, nargs='+', help='Which intervals to process over. "all" scans all intervals in sequence') parser.add_argument('--sample', dest="sample_fdd", action='store_const', const=True, default=False, help='Sample') util.add_arguments(parser) args = parser.parse_args() intervals = args.intervals SAMPLE = args.sample_fdd util.process_arguments(args) if intervals[0] == "all": intervals = list(util.iter_yearseason()) make_dirs(intervals) for interval in intervals: make_textsim_graph(interval) util.close_pool()
import plotly
import math

#WEB_PREFIX = "https://cs.princeton.edu/~rbamos"
#WEB_DIR = os.path.expanduser("~/public_html/")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("""Usage:
python3 -m historical.draw_freq_dist <n>""")
        exit(-1)
    n = sys.argv[1]
    np.set_printoptions(threshold=sys.maxsize)
    yss = [ys for ys in util.iter_yearseason()]
    fig = go.Figure()
    b = 2
    max_count = 0
    for ys in yss:
        freqs = [r[0] for r in util.load_grams(n, ys)]
        max_count = max(max_count, len(freqs))
    log_index = [
        int(math.pow(b, i)) for i in range(int(math.log(max_count, b)))
    ]
    for ys in yss:
        freqs = [r[0] for r in util.load_grams(n, ys)]
        freqs2 = [freqs[i] if i < len(freqs) else 0 for i in log_index]
        fig.add_trace(go.Scatter(x=log_index, y=freqs2, mode='lines', name=ys))