def count_clicks(src):
    """Return the sum of the click counts stored in ``src``.

    Every row produced by ``read_vm_file(src)`` is unpacked as
    ``(referrer, target, num_clicks)``; only ``num_clicks`` contributes.
    """
    print("Processing %s" % src)
    return sum(clicks for _referrer, _target, clicks in read_vm_file(src))
def read_counts(src):
    """Return the click count of every row of ``src`` as a list, in row order."""
    print("Processing %s" % src)
    return [clicks for _, _, clicks in read_vm_file(src)]
def change_domain_levels(src, dest, new_domain_level):
    """Rewrite ``src`` at a new domain level and write the merged counts to ``dest``.

    Rows are skipped when the referrer or target is None, when the click
    count is -1, or when ``domain_level`` reports 1 for either host. Both
    hosts of every remaining row are passed through ``change_domain_level``
    and the click counts of rows that end up with the same
    (referrer, target) pair are summed. The result is written to ``dest``
    as tab-delimited rows ``referrer, target, clicks``.
    """
    print("Processing %s" % (src,))
    aggregated = {}
    for referrer, target, num_clicks in read_vm_file(src):
        # The None checks come first so domain_level() is never given None.
        if referrer is None or target is None:
            continue
        if num_clicks == -1:
            continue
        if domain_level(referrer) == 1 or domain_level(target) == 1:
            continue
        new_referrer = change_domain_level(referrer, new_domain_level)
        new_target = change_domain_level(target, new_domain_level)
        per_target = aggregated.setdefault(new_referrer, {})
        per_target[new_target] = per_target.get(new_target, 0) + num_clicks
    with open(dest, "w") as out:
        writer = csv.writer(out, delimiter="\t")
        for referrer, per_target in aggregated.items():
            for target, count in per_target.items():
                writer.writerow([referrer, target, count])
def convert_to_json(src, dest):
    """Write the most-clicked targets of ``src`` to ``dest`` as a JSON tree.

    Click counts from ``read_vm_file(src)`` are summed per target. Targets
    are then emitted in descending order of clicks until their cumulative
    share of all clicks exceeds 30%. The output is an object named
    "entropy" whose "children" are ``{"name": <target>, "size": <clicks>}``
    objects.

    Args:
        src: Path handed to ``read_vm_file``.
        dest: Path of the JSON file to write (opened in 'w' mode, so any
            existing content is replaced).
    """
    print("Processing %s" % src)
    click_counts = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        click_counts[target] = click_counts.get(target, 0) + num_clicks

    total_count = float(sum(click_counts.values()))
    ranked = sorted(click_counts.items(), key=lambda item: item[1], reverse=True)

    children = []
    cum_count = 0
    for target, count in ranked:
        cum_count += count
        # Escape backslashes and double quotes so an unusual target name
        # cannot terminate the JSON string early.
        name = ("%s" % (target,)).replace("\\", "\\\\").replace('"', '\\"')
        children.append(' {"name": "%s", "size": %d}' % (name, count))
        # A zero total has no meaningful share; list every target instead
        # of dividing by zero.
        if total_count and cum_count / total_count > .3:
            break

    # Joining places the separator only between entries, so no dangling
    # comma is left behind when the cutoff is never reached.
    body = ",\n".join(children)
    if children:
        body += "\n"
    with open(dest, 'w') as destf:
        destf.write("""{ "name": "entropy", "children": [ %s ] } """ % body)
def compute_traffic_volume(src):
    """Return the traffic volume of ``src``: the sum of its rows' click counts."""
    print("Processing %s" % src)
    rows = read_vm_file(src)
    return sum(row_clicks for _, _, row_clicks in rows)
def count_clicks(files):
    """Return the total number of rows found across ``files``.

    This counts rows, one per ``(referrer, target, num_clicks)`` record;
    the ``num_clicks`` values themselves are not added up.
    """
    row_total = 0
    for path in files:
        print("Processing %s" % (path,))
        row_total += sum(
            1 for _referrer, _target, _clicks in read_vm_file(path)
        )
    return row_total
def create_categories(src, dest, hosts):
    """Copy to ``dest`` the rows of ``src`` whose referrer belongs to ``hosts``.

    Membership is decided by ``is_member(referrer, hosts)``. Kept rows are
    written unchanged as tab-delimited ``referrer, target, num_clicks``.
    """
    print("Processing %s" % dest)
    rows = read_vm_file(src)
    with open(dest, 'w') as out:
        csv.writer(out, delimiter="\t").writerows(
            [referrer, target, num_clicks]
            for referrer, target, num_clicks in rows
            if is_member(referrer, hosts)
        )
def filter_targets(src, dest):
    """Copy ``src`` to ``dest``, dropping rows whose target ``should_skip_host`` rejects.

    Surviving rows are written unchanged as tab-delimited
    ``referrer, target, num_clicks``.
    """
    print("Processing %s" % src)
    rows = read_vm_file(src)
    with open(dest, 'w') as out:
        writer = csv.writer(out, delimiter="\t")
        for referrer, target, num_clicks in rows:
            if should_skip_host(target):
                continue
            writer.writerow([referrer, target, num_clicks])
def filter_referrers(src, dest):
    """Copy ``src`` to ``dest``, dropping rows whose referrer ``should_skip_host`` rejects.

    Surviving rows are written unchanged as tab-delimited
    ``referrer, target, num_clicks``.
    """
    print("Processing %s" % src)
    rows = read_vm_file(src)
    with open(dest, 'w') as out:
        csv.writer(out, delimiter="\t").writerows(
            [referrer, target, num_clicks]
            for referrer, target, num_clicks in rows
            if not should_skip_host(referrer)
        )
def count_targets(files):
    """Return the number of distinct targets seen across all of ``files``."""
    distinct = set()
    for path in files:
        print("Processing %s" % (path,))
        distinct.update(
            target for _referrer, target, _clicks in read_vm_file(path)
        )
    return len(distinct)
def filter_news_junk(src, dest):
    """Normalize every target in ``src`` and write the rows worth keeping to ``dest``.

    Each target is passed through ``normalize_url``; rows whose normalized
    target is rejected by ``should_skip_host`` are dropped. Kept rows are
    written tab-delimited with the original referrer, the normalized
    target and the original click count.
    """
    print("Processing %s" % src)
    rows = read_vm_file(src)
    with open(dest, 'w') as out:
        writer = csv.writer(out, delimiter="\t")
        for referrer, raw_target, num_clicks in rows:
            clean_target = normalize_url(raw_target)
            if should_skip_host(clean_target):
                continue
            writer.writerow([referrer, clean_target, num_clicks])
def show_top_targets(src):
    """Return a dict mapping each target in ``src`` to its summed click count.

    The result is neither sorted nor truncated; every target is included.
    """
    print("Processing %s" % src)
    totals = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        totals[target] = totals.get(target, 0) + num_clicks
    return totals
def count_clicks_from_referrer(files, referrers):
    """Count, per referrer, the rows across ``files`` that come from ``referrers``.

    Each matching row adds one to its referrer's tally; ``num_clicks`` is
    not used. Referrers that never appear are absent from the result.
    """
    row_counts = {}
    for path in files:
        print("Processing %s" % (path,))
        for referrer, _target, _clicks in read_vm_file(path):
            if referrer not in referrers:
                continue
            row_counts[referrer] = row_counts.get(referrer, 0) + 1
    return row_counts
def remove_unwanted(src, dest):
    """Normalize both hosts of every row in ``src`` and write the kept rows to ``dest``.

    The referrer and target are passed through ``normalize_url``; rows whose
    normalized target is rejected by ``should_skip_host`` are dropped. Kept
    rows are written tab-delimited as
    ``normalized_referrer, normalized_target, num_clicks``.

    Processing is best-effort per row: if normalizing, filtering or writing
    a row raises, the row is skipped and its original referrer and target
    are printed.
    """
    print("Processing %s" % src)
    data = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, target, num_clicks in data:
            try:
                r = normalize_url(referrer)
                t = normalize_url(target)
                if not should_skip_host(t):
                    writer.writerow([r, t, num_clicks])
            except Exception:
                # Exception, not a bare except: KeyboardInterrupt and
                # SystemExit must still be able to stop the run.
                print("Couldn't normalize. Skipping.")
                print(referrer)
                print(target)
def combine_files(files, destfile):
    """Merge ``files`` into ``destfile``, summing clicks per (referrer, target) pair.

    The output is tab-delimited, one ``referrer, target, clicks`` row for
    each distinct pair found in any of the input files.
    """
    print("Processing %s" % destfile)
    merged = {}
    for path in files:
        for referrer, target, num_clicks in read_vm_file(path):
            by_target = merged.setdefault(referrer, {})
            by_target[target] = by_target.get(target, 0) + num_clicks
    with open(destfile, 'w') as out:
        csv.writer(out, delimiter="\t").writerows(
            [referrer, target, count]
            for referrer, by_target in merged.items()
            for target, count in by_target.items()
        )
def smooth_vm(files, dest):
    """Sum the click counts of ``files`` per (referrer, target) pair into ``dest``.

    No transformation other than summation is applied. The output is
    tab-delimited, one ``referrer, target, clicks`` row per distinct pair.

    Args:
        files: Iterable of paths handed to ``read_vm_file``.
        dest: Path of the file to write (opened in 'w' mode).
    """
    print("Processing %s" % dest)
    data = {}
    # Read one file at a time instead of calling read_vm_file on every
    # input before aggregating any of them.
    for path in files:
        for referrer, target, num_clicks in read_vm_file(path):
            by_target = data.setdefault(referrer, {})
            by_target[target] = by_target.get(target, 0) + num_clicks
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, by_target in data.items():
            for target, count in by_target.items():
                writer.writerow([referrer, target, count])
def count_top_targets_per_file(src, dest):
    """Rank the targets of ``src`` by clicks and write the ranking to ``dest``.

    Output rows are tab-delimited ``target, clicks, remaining_share``,
    ordered by descending clicks. ``remaining_share`` is the fraction of
    all clicks not yet accounted for once this target and every target
    ranked above it have been counted.
    """
    print("Processing %s" % src)
    per_target = {}
    for _referrer, target, num_clicks in read_vm_file(src):
        per_target[target] = per_target.get(target, 0) + num_clicks
    grand_total = float(numpy.sum(per_target.values()))
    ranking = sorted(per_target.items(), key=lambda pair: pair[1], reverse=True)
    with open(dest, "w") as out:
        writer = csv.writer(out, delimiter="\t")
        running = 0
        for target, count in ranking:
            running += count
            remaining_share = (grand_total - running) / grand_total
            writer.writerow([target, count, remaining_share])
def index_vms(files):
    """Build a cumulative-click index over every (referrer, target) pair in ``files``.

    Click counts are first summed per pair across all files. The result is
    a list of ``(referrer, target, running_total)`` tuples, where
    ``running_total`` is the sum of the clicks of that pair and of every
    pair listed before it. Pairs appear in dict iteration order.
    """
    print("Loading index.")
    vms = {}
    for filepath in files:
        for referrer, target, num_clicks in read_vm_file(filepath):
            targets = vms.setdefault(referrer, {})
            targets[target] = targets.get(target, 0) + num_clicks
    index = []
    running_total = 0
    for referrer, targets in vms.items():
        for target, clicks in targets.items():
            running_total += clicks
            index.append((referrer, target, running_total))
    return index
def count_top_targets_per_file(src, dest):
    """Write the targets of ``src`` to ``dest``, most-clicked first.

    Each tab-delimited output row holds the target, its summed click count,
    and the fraction of the file's total clicks that is still left after
    this target and all higher-ranked targets have been counted.
    """
    print("Processing %s" % src)
    tallies = {}
    for _, target, num_clicks in read_vm_file(src):
        tallies.setdefault(target, 0)
        tallies[target] += num_clicks
    all_clicks = float(numpy.sum(tallies.values()))
    by_popularity = sorted(
        tallies.items(), key=lambda entry: entry[1], reverse=True
    )
    seen_clicks = 0
    with open(dest, 'w') as out:
        tsv = csv.writer(out, delimiter="\t")
        for target, count in by_popularity:
            seen_clicks += count
            tsv.writerow([target, count, (all_clicks - seen_clicks) / all_clicks])
def change_domain_levels(src, dest, new_domain_level):
    """Collapse the hosts in ``src`` to ``new_domain_level`` and save the totals in ``dest``.

    A row is ignored if its referrer or target is None, if its click count
    is -1, or if ``domain_level`` returns 1 for either host. The remaining
    rows have both hosts rewritten by ``change_domain_level``; clicks are
    summed per rewritten (referrer, target) pair and written to ``dest``
    as tab-delimited ``referrer, target, clicks`` rows.
    """
    print("Processing %s" % (src,))
    totals = {}
    for referrer, target, num_clicks in read_vm_file(src):
        # Same test order as before: None checks, then the -1 marker, then
        # the domain_level() calls.
        missing_host = referrer is None or target is None
        if missing_host or num_clicks == -1:
            continue
        if domain_level(referrer) == 1 or domain_level(target) == 1:
            continue
        src_host = change_domain_level(referrer, new_domain_level)
        dst_host = change_domain_level(target, new_domain_level)
        row = totals.setdefault(src_host, {})
        row.setdefault(dst_host, 0)
        row[dst_host] += num_clicks
    with open(dest, 'w') as out:
        tsv = csv.writer(out, delimiter="\t")
        for src_host, row in totals.items():
            for dst_host, count in row.items():
                tsv.writerow([src_host, dst_host, count])
def sample_vm(src, dest, num_clicks_to_sample):
    """Load ``src`` with ``read_vm_file`` and hand the rows to ``sample_vm_data``.

    ``dest`` and ``num_clicks_to_sample`` are forwarded unchanged, and the
    return value is whatever ``sample_vm_data`` returns.
    """
    rows = read_vm_file(src)
    return sample_vm_data(rows, dest, num_clicks_to_sample)