def find_changed_policies():
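    # Build a nested cache: domain -> year -> season -> simhash of the policy
    # text. fnd.hash_text is assumed to take a (policy_text, site_url, year,
    # season) tuple and return (hash, domain, year, season).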
    policies_cache = defaultdict(lambda: defaultdict(dict))
    for h, d, y, s in util.get_pool().map(
            fnd.hash_text,
        ((data["policy_text"], data["site_url"], data["year"], data["season"])
         for data, cols in util.load_all_policies())):
        policies_cache[d][y][s] = h

    changed_pols = defaultdict(lambda: defaultdict(lambda: 0))
    all_pols = defaultdict(lambda: defaultdict(lambda: 0))

    for dom in policies_cache:
        prev_pol = None
        for y, s in util.iter_year_season():
            try:
                pol = policies_cache[dom][y][s]
            except KeyError:
                continue
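            # simhash.find_all appears to return the pairs of hashes that fall
            # within the given block/bit-difference thresholds, so a non-empty
            # result means the two consecutive snapshots hash to near-identical
            # values.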
            if prev_pol is not None:
                if len(simhash.find_all([prev_pol, pol], 4, 3)) != 0:
                    changed_pols[y][s] += 1
                all_pols[y][s] += 1
            prev_pol = pol
    return ([changed_pols[y][s] for y, s in util.iter_year_season()],
            [all_pols[y][s] for y, s in util.iter_year_season()])
def draw_html_and_reg_heatmaps(n, phrases, raw_data):
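    # Reuse heatmaps that already exist on disk and only render the missing
    # ones: collect URLs for pre-drawn files, fetch data for the remaining
    # phrases, then draw graphical and HTML heatmaps in parallel on the pool.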
    print("\033[KFinding pre-drawn heatmaps", end='\r')
    r_urls = {
        p: get_heatmap_url(n, p)
        for p in phrases if os.path.exists(get_heatmap_fn(n, p))
    }
    r_phrases_filtered = {
        p for p in phrases if not os.path.exists(get_heatmap_fn(n, p))
    }

    h_urls = {
        p: get_html_heatmap_url(n, p)
        for p in phrases if os.path.exists(get_html_heatmap_fn(n, p))
    }
    h_phrases_filtered = {
        p for p in phrases if not os.path.exists(get_html_heatmap_fn(n, p))
    }

    print("\033[KFiltering out pre-drawn heatmaps", end='\r')
    phrases = list(r_phrases_filtered.union(h_phrases_filtered))
    #print("Making heatmaps for %d phrases" % len(phrases))

    print("\033[KFetching heatmap data", end='\r')
    ars, ls = util.fetch_domains_for_phrases(phrases, n, raw_data)
    hm_html_args = []
    hm_reg_args = []

    print("\033[KBuilding args", end='\r')
    for p in phrases:

        ar = ars[p]
        l = ls[p]

        if p in h_phrases_filtered:
            #h_urls[p] = draw_html_heatmap_from_data(n,p,ar,l)
            hm_html_args.append((n, p, ar, l))
        if p in r_phrases_filtered:
            #r_urls[p] = draw_heatmap_from_data(n,p,ar,l)
            hm_reg_args.append((n, p, ar, l))

    pool = util.get_pool()

    print("\033[KDrawing HTML heatmaps", end='\r')
    for p, url in pool.starmap(wrapper_draw_html_heatmap_from_data,
                               hm_html_args):
        h_urls[p] = url
    print("\033[KDrawing graphical heatmaps", end='\r')
    for p, url in pool.starmap(wrapper_draw_heatmap_from_data, hm_reg_args):
        r_urls[p] = url

    print("\033[KDone drawing heatmaps", end='\r')
    return r_urls, h_urls
def make_textsim_graph(filtername):
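    # Build a text-similarity graph over policies: simhash every document, find
    # near-duplicate hash pairs, optionally confirm them with a fuzzy or
    # compression-based distance, and write the resulting cross-domain groups
    # to ../data/text_sim/<filtername>/policy_links.json.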
    try:
        print("making dirs: %s" % ("../data/text_sim/%s/" % filtername))
        os.makedirs("../data/text_sim/%s/" % filtername)
    except OSError:
        pass
    
    p = util.get_pool()
    args = []
    for data, cols in ioutils.load_all_policies(limit=-1, filtername=filtername):
        text = data["policy_text"]
        domain = data["site_url"]
        year = int(data["year"])
        season = data["season"]
        args.append((text, domain, year, season))
    print("Total docs is %d" % len(args))
            
    simhashes = {}
    all_hashes = []
    sentences = {}
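    # hash_text is assumed to return (simhash, original_text, domain, year,
    # season); identical texts share one integer sentence id, and each simhash
    # records which (domain, year, season) policies produced it.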
    for h, sentence, domain, year, season in p.map(hash_text, args):
        if sentence not in sentences:
            sentId = len(sentences)
            sentences[sentence] = sentId
        else:
            sentId = sentences[sentence]

        if h not in simhashes:
            simhashes[h] = {}
        simhashes[h][(domain,year,season)] = sentId
        all_hashes.append(h)

    matches = simhash.find_all(all_hashes, SIMHASH_THRESH + 1, SIMHASH_THRESH)
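    # matches now appears to hold every pair of simhashes that differ by at
    # most SIMHASH_THRESH bits (find_all splits each hash into
    # SIMHASH_THRESH + 1 blocks).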


    sentence_inv = {i: s for s, i in sentences.items()}
    del sentences


    lzma_filters = [
        {
            "id": lzma.FILTER_LZMA2, 
            "preset": 9 | lzma.PRESET_EXTREME, 
            "dict_size": 100000, #~10k words in english speaker's vocab, x10 for good measure
            "lc": 3,
            "lp": 0,
            "pb": 0, # assume ascii
            "mode": lzma.MODE_NORMAL,
            "nice_len": 273,
            "mf": lzma.MF_BT4
        }
    ]
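    # This filter chain is presumably consumed by check_distance for a
    # compression-based (NCD-style) comparison of candidate sentence pairs.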
        
    adj = {}
    adj_sen = {}
    adj_sen_dom = {}
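    # A hash shared by more than one policy is also compared against itself.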
    self_match = [(h,h) for h in simhashes if len(simhashes[h]) > 1]

    if SAMPLE:
        dist_bins = [[] for i in range(10)]

    accepted = 0
    rejected = 0
    rejected_low_pass = 0
    for l,r in itertools.chain(matches, self_match):
        lpols = simhashes[l].keys()
        rpols = simhashes[r].keys()
        ldomains = set((dom for dom, _, _ in simhashes[l]))
        rdomains = set((dom for dom, _, _ in simhashes[r]))
        domains = ldomains.union(rdomains)
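        # Keep the pair only if it is a self-match of a shared hash, or if each
        # side contributes a domain the other lacks (i.e. the match creates a
        # genuinely cross-domain link).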
        if l == r or len(domains) > max(len(ldomains),len(rdomains)):
            for ld, ly, ls in lpols:
                for rd, ry, rs in rpols:
                    lt = ly * 10 + seasonToOrd[ls]
                    rt = ry * 10 + seasonToOrd[rs]
                    if lt == rt:
                        if CROSS_YEAR_ONLY:
                            continue
                        first = "%d%s_%s" % (ly, ls, ld)
                        second = "%d%s_%s" % (ry, rs, rd)
                    elif lt < rt:
                        first = "%d%s_%s" % (ly, ls, ld)
                        second = "%d%s_%s" % (ry, rs, rd)
                    else:
                        first = "%d%s_%s" % (ry, rs, rd)
                        second = "%d%s_%s" % (ly, ls, ld)
                    if first not in adj:
                        adj[first] = []
                    adj[first].append(second)

                    lId = simhashes[l][ld,ly,ls]
                    rId = simhashes[r][rd,ry,rs]
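                    # With NCD, smaller comp_dist means more similar text
                    # (reject above NCD_THRESH); with the fuzzy score, larger
                    # means more similar (reject below FUZZ_THRESH).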
                    if FUZZ_THRESH == 0 and not USE_NCD:
                        #Anything will pass, no need to compute
                        comp_dist = 100
                    else:
                        comp_dist = check_distance(lId,rId,sentence_inv, lzma_filters)
                    if USE_NCD:
                        if len(sentence_inv[lId]) + len(sentence_inv[rId]) < 200:
                            comp_dist -= 0.3 #Magic offset because NCD doesn't work well on small text
                        if comp_dist > NCD_THRESH:
                            print("Ruled out %s x %s (%f, %s, %s)" % (ld, rd, comp_dist, hex(l), hex(r)))
                            rejected += 1
                            continue
                    else:
                        if SAMPLE:
                            if comp_dist != 100 and comp_dist >= 90:
                                dist_bins[100 - (comp_dist + 1)].append((lId,rId,comp_dist))
                        if comp_dist < 90:
                            rejected_low_pass += 1
                        if comp_dist < FUZZ_THRESH:
                            # print("Ruled out %s x %s (%f, %s, %s)" % (ld, rd, comp_dist, hex(l), hex(r)))
                            rejected += 1
                            continue
                    accepted += 1

                    if lId not in adj_sen_dom:
                        adj_sen_dom[lId] = set()
                    adj_sen_dom[lId].add(first)
                    adj_sen_dom[lId].add(second)

                    if lId not in adj_sen:
                        adj_sen[lId] = set()
                    adj_sen[lId].add(rId)
                    if rId not in adj_sen:
                        adj_sen[rId] = set()
                    adj_sen[rId].add(lId)

    adj_rev, adj_rep = bfs(adj)

    print("Accepted: %d, rejected: %d, low pass: %d" % (accepted, rejected, rejected_low_pass))

    if SAMPLE:
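        # Dump a sample of matched pairs with a side-by-side diff so the
        # fuzzy-match quality can be inspected by hand.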
        for i in range(len(dist_bins)):
            with open("../data/text_sim/sample_0_%d.txt" % i, "w+") as f:
                if len(dist_bins[i]) <= 50:
                    sample = dist_bins[i]
                else:
                    sample = random.sample(dist_bins[i],10)
                for lId,rId,comp_dist in sample:
                    #print(lId,rId,comp_dist)
                    s1 = sentence_inv[lId]
                    s2 = sentence_inv[rId]
                    with open("../data/text_sim/s1_tmp.txt","w+") as f1: f1.write(s1)
                    with open("../data/text_sim/s2_tmp.txt","w+") as f1: f1.write(s2)
                    try:
                        diff = subprocess.check_output("echo \"diff -y <(fold -s -w72 ../data/text_sim/s1_tmp.txt) <(fold -s -w72 ../data/text_sim/s2_tmp.txt) -W 200; exit 0\" | bash", shell=True)
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 2:
                            print(e.output)
                            sys.exit(1)
                        diff = e.output
                    diff = diff.decode()
                    f.write("%s\n%d\n%s\n" % ("="*40,comp_dist,"-"*40))
                    f.write("%s\n" % (diff))

    with open("../data/text_sim/%s/policy_links.json" % filtername, "w+") as f:
        write_obj = []
        for i, s in enumerate(adj_rep):
            l = [dom[6:] for dom in adj_rep[s]]
            write_obj.append({"id": i, "domains": l})
        json.dump(write_obj, f)
def create_data_dump(n,
                     folders,
                     basename,
                     groups,
                     gram_freq,
                     gram_freq_raw,
                     score_name_str,
                     scores=None,
                     gen_figs=False):
    """
    Take the given phrases and create:
    - A table of phrase, score (if available), occurances
    - Line graphs for phrase occurance
    - Heatmaps for phrases
    """
    nwebfn, nurl = util.get_web_fn(*folders, "%s.html" % (basename))
    group_to_graph = {}
    hit_to_graph = {}

    #Part one -- create CSV data files, heatmaps, and line graphs
    for group_num, group_hits in enumerate(groups):
        nfn = os.path.join(util.OUT_DIR, "%s_%d.csv" % (basename, group_num))
        if gen_figs:
            #Line graph
            line_url = draw_lines.draw_lines(nfn, -1, ((hit, gram_freq[hit])
                                                       for hit in group_hits))
            logging.info("\t\tDone drawling lines for group %d" % (group_num))
            #Heatmp
            if n == 'w':
                html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                    n, group_hits)
            else:
                heatmap_urls, html_heatmap_urls = draw_heatmap.draw_html_and_reg_heatmaps(
                    n, group_hits, gram_freq_raw)
            logging.info("\033[K\t\tDone drawing heatmaps for group %d" %
                         (group_num))
        else:  #Skip drawing
            line_url = draw_lines.get_lines_url(nfn, -1)
            heatmap_urls = draw_heatmap.get_multiple_heatmap_urls(
                n, group_hits)
            html_heatmap_urls = draw_heatmap.get_multiple_html_heatmap_urls(
                n, group_hits)
        group_to_graph[group_num] = line_url
        with open(nfn, "w+") as f:  #Write CSV data file for later use
            writer = csv.writer(f)
            for hit in group_hits:
                vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                writer.writerow([
                    hit.replace("\\", "\\\\").replace("\n", "\\n"), *vals_str
                ])
                #heatmap_url = heatmap_urls[hit]
                heatmap_url = html_heatmap_urls[hit]
                #                print(heatmap_url)
                hit_to_graph[hit] = heatmap_url
        logging.info("\t\tDone drawing group %d" % (group_num))

    #Part two -- create usable web page to explore
    with open(nwebfn, "w+") as nwebf:
        nwebf.write("""<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
</style>
</head>
<body>
""")
        pool = util.get_pool()
        all_hits = list(itertools.chain.from_iterable(groups))
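        # all_hits flattens the groups; a small inline plot is rendered per
        # phrase in parallel (ys_list and num_intervals are assumed to be
        # module-level globals describing the year/season intervals).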

        plots = pool.starmap(miniplot.get_plot_as_img,
                             [(ys_list, gram_freq[hit]) for hit in all_hits])
        plots = list(plots)
        logging.info("\t\tDone drawing plots")

        #Break up by group
        pNum = 0
        for gid, group in enumerate(groups):
            if len(group) == 0: continue
            line_graph_url = group_to_graph[gid]
            nwebf.write('<h3>%s</h3>' % (score_name_str))
            nwebf.write(
                '<h4><a href="%s" target="_blank">Line graph for group %d</a></h4>\n'
                % (line_graph_url, gid))
            nwebf.write('<table>\n')
            headers = ["Score", "Phrase", "Bar Plot"] + list(
                util.iter_yearseason())
            if scores is None:
                headers = headers[1:]
            nwebf.write('<tr><th>%s</th></tr>' % '</th><th>'.join(headers))
            #Then phrase
            for hit in group:
                nwebf.write('<tr>\n')
                if scores is not None:
                    if abs(scores[hit]) < 0.01 and scores[hit] != 0:
                        nwebf.write('<td>%0.2E</td>' % scores[hit])
                    else:
                        nwebf.write('<td>%0.2f</td>' % scores[hit])
                nwebf.write('<td width="30%">\n')
                #vals_str = ["%0.2f" % f for f in gram_freq[hit]]
                vals_str = [
                    ("%0.2E" if
                     (abs(gram_freq[hit][ysid]) != 0
                      and abs(gram_freq[hit][ysid]) < 0.01) else "%0.2f") %
                    (gram_freq[hit][ysid]) for ysid in range(num_intervals)
                ]

                heatmap_graph_url = hit_to_graph[hit]
                hit_link = '<a href="%s" target="_blank">%s</a>' % (
                    heatmap_graph_url, hit.replace("\\", "\\\\").replace(
                        "\n", "\\n"))

                plot = plots[
                    pNum]  # miniplot.get_plot_as_img(ys_list,gram_freq[hit])
                pNum += 1

                nwebf.write('</td><td>'.join([hit_link, plot,
                                              *vals_str]))  #TODO add miniplot
                nwebf.write('</td></tr>\n')
            logging.info("\t\tDone with table for group %d" % (gid))

            nwebf.write('</table>\n')
        nwebf.write("</body></html>")
    mem_count()
    return nwebfn, nurl
def clean_parallel():
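    # Stream raw policies in batches, clean them in parallel workers, write the
    # cleaned rows back through ioutils, and return a map of
    # domain -> {(year, season), ...} for the documents that survived cleaning.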
    global data_cache
    
    cols = ("crawl_time", "site_url", "homepage_snapshot_url", "policy_snapshot_url", "year", "season", "policy_text", "policy_filetype", "visit_info", "entities", "emails", "urls", "nums")
    loaded_docs = {}

    pool = util.get_pool()

    data_cache = {} if CACHE_RES else None
    
    ct = 0
    policies_it = ioutils.load_all_policies(clean=False)
    next_data_ar = list(itertools.islice(policies_it,100*util.WORKERS))

    blacklist = util.get_blacklist()
    
    while True:
        data_ar = next_data_ar

        if len(data_ar) == 0:
            break

        args = [(data["policy_text"],data["policy_source"])
                if (data["homepage_snapshot_url"] not in blacklist) else
                (None,None)
                for data,_ in data_ar ]
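        # Overlap I/O with computation: dispatch this batch to the workers
        # asynchronously, prefetch the next batch while they run, then collect.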
        result = pool.starmap_async(clean_doc, args)
        next_data_ar = list(itertools.islice(policies_it,100*util.WORKERS))
        res = list(result.get())
        

        to_write = []
        for i in range(len(data_ar)):
            text,entities,emails,urls,nums = res[i]
            if text is None:
                continue
            
            data,_ = data_ar[i]
            data = dict(data)
            
            data["policy_text"] = text
            del data["policy_source"]
            data["entities"] = entities
            data["emails"] = emails
            data["urls"] = urls
            data["nums"] = nums
            cols = data.keys()
            to_write.append(data)

            domain = data["site_url"]
            year = data["year"]
            season = data["season"]
            if domain not in loaded_docs:
                loaded_docs[domain] = set()
            loaded_docs[domain].add((year,season))

            if data_cache is not None:
                data_cache[(domain,year,season)] = data
            
            ct += 1
            if ct % 100 == 0:
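                # 'total' is assumed to be a module-level count of all policies.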
                print("%d/%d done" % (ct,total),end='\r')
                
        ioutils.write_policy(to_write, cols)
        ioutils.flush_db()

    util.close_pool()

    return loaded_docs