Example #1
def remove_unwanted(src, dest):
	print "Processing %s" % src
	data = read_vm_file(src)
	with open(dest, 'w') as destf:
		writer = csv.writer(destf, delimiter="\t")
		for referrer, target, num_clicks in data:
			try:
				r = normalize_url(referrer)
				t = normalize_url(target)
				if not should_skip_host(t):
					writer.writerow([r, t, num_clicks])
			except Exception:
				# log the bad row and move on rather than aborting the whole file
				print "Couldn't normalize. Skipping."
				print referrer
				print target
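
These examples assume Python 2 and module-level imports of csv, struct, time, and datetime. normalize_url is not defined here either; a minimal sketch, assuming it lowercases the host and strips the scheme, a leading "www.", and any trailing slash:

def normalize_url(url):
	# assumed behavior only: lowercase, drop the scheme and a leading
	# "www.", and trim surrounding whitespace and any trailing slash
	url = url.strip().lower()
	if '://' in url:
		url = url[url.index('://') + 3:]
	if url.startswith('www.'):
		url = url[4:]
	return url.rstrip('/')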
Example #2
def prune_news_dataset(news_sources_file):
    # read the list of news hosts, keeping one normalized entry per host
    news_urls = set()
    with open(news_sources_file, 'r') as f:
        for line in f:
            # strip a scheme prefix like "http://" if present
            if '://' in line:
                host = line[line.index('://') + 3:]
            else:
                host = line
            host = host.strip().split('/')[0]

            host = normalize_url(host)

            if host in news_urls or host in UNWANTED_URLS or fnmatches_multiple(
                    EXCEPTION_PATTERNS, host):
                continue

            news_urls.add(host)

    # print only hosts whose parent domain is not already covered
    for host in sorted(news_urls):
        disregard = False
        for parent in parents(host):
            if parent in news_urls or parent in UNWANTED_URLS:
                disregard = True
                break
        if not disregard:
            print host
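
parents and fnmatches_multiple are not defined in these examples. Plausible sketches, assuming parents walks up the domain hierarchy and fnmatches_multiple wraps the standard fnmatch module:

import fnmatch

def parents(host):
    # yield each parent domain of a host, e.g. "a.b.com" -> "b.com", "com"
    parts = host.split('.')
    for i in range(1, len(parts)):
        yield '.'.join(parts[i:])

def fnmatches_multiple(patterns, host):
    # True if the host matches any of the shell-style patterns
    return any(fnmatch.fnmatch(host, p) for p in patterns)

Example #3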
def filter_news_junk(src, dest):
	print "Processing %s" % src
	data = read_vm_file(src)
	with open(dest, 'w') as destf:
		writer = csv.writer(destf, delimiter="\t")
		for referrer, target, num_clicks in data:
			target = normalize_url(target)
			if not should_skip_host(target):
				writer.writerow([referrer, target, num_clicks])
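
should_skip_host is undefined here as well; one assumption consistent with its use above is a membership test against UNWANTED_URLS that also covers parent domains:

def should_skip_host(host):
	# hypothetical: skip a host if it, or any parent domain, is unwanted
	if host in UNWANTED_URLS:
		return True
	for parent in parents(host):
		if parent in UNWANTED_URLS:
			return True
	return False

Example #4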
def read_raw_aol_data(filepath):
    clicks = []
    with open(filepath, 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        reader.next()  # skip header
        for row in reader:
            try:
                user = int(row[0])
                dest = normalize_url(row[4])
                clicks.append((user, dest))
            except Exception:
                print "Failed:", row
    return clicks
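
A small usage sketch; the path is hypothetical, and the column indices above (row[0] for the user ID, row[4] for the clicked URL) appear to match the layout of the public AOL query log:

from collections import Counter

clicks = read_raw_aol_data('user-ct-test-collection-01.txt')  # hypothetical path
per_site = Counter(dest for user, dest in clicks)
print per_site.most_common(10)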
Example #5
def read_raw_twitter_data(filepath):
	tweets = []
	with open(filepath, 'r') as f:
		for line in f:
			try:
				parts = line.split("|")

				user = int(parts[0])
				dtstr = parts[1]
				friends = int(parts[-2])
				followers = int(parts[-3])
			
				dest = "".join(parts[2:-3])
				dest = normalize_url(dest)
				tweets.append((user, dest, followers, friends))
			except Exception:
				print "Could not parse line: %s\t%s" % (line, filepath)
	return tweets
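
Each input line is assumed to be pipe-delimited, roughly user|datetime|url|followers|friends|<trailer>, which is why the URL is rebuilt from the middle fields (a URL that itself contains '|' comes back with its pieces concatenated). A hypothetical usage:

# hypothetical input line (the field after 'friends' is ignored):
# 12345|2009-06-15 10:02:11|http://example.com/a|1500|300|X
tweets = read_raw_twitter_data('tweets.txt')  # hypothetical path
for user, dest, followers, friends in tweets[:5]:
	print user, dest, followers, friends

Example #6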
def prune_news_dataset(news_sources_file):
	news_urls = set()
	with open(news_sources_file, 'r') as f:
		for line in f:
			host = line.strip().split('/')[0]

			host = normalize_url(host)

			if host in news_urls or host in UNWANTED_URLS or fnmatches_multiple(EXCEPTION_PATTERNS, host):
				continue

			news_urls.add(host)
	
	for host in sorted(news_urls):
		disregard = False
		for parent in parents(host):
			if parent in news_urls or parent in UNWANTED_URLS:
				disregard = True
				break
		if not disregard:
			print host
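
Example #7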
def validate_requests(src, valid_dest, invalid_dest):
	print "Processing", src

	# extract a timestamp from the filename to validate against
	file_dt = parse_dt_from_filename(src)
	
	with open(valid_dest, 'w') as valid_destf:
		valid_writer = csv.writer(valid_destf, delimiter="\t")

		with open(invalid_dest, 'w') as invalid_destf:

			with open(src, 'r') as f:
				stage = READING_REFERRER

				# a request is [timestamp, referrer, target, direction, agent]
				request = [None, None, None, None, None]
		
				# discard the header line
				f.readline()

				out_of_sync = False

				prev = None
				prevprev = None
				for rawline in f:
					line = rawline.strip()
					if line in SKIP_LINES:
						continue

					# if somehow we got out of sync reading the file
					# try to find a referrer line
					if out_of_sync:
						stage = READING_REFERRER

					# reading the first line of a request consisting of: XXXXAD[R] where
					# XXXX is the timestamp in little endian order
					# A is the agent and can be either 'B' for browser or '?' for unknown
					# D is the direction, 'I' for traffic going into IU, 'O' for traffic going outside IU
					# R is the referrer
					if stage == READING_REFERRER:
						# it's possible to have new lines between records
						if line == '':
							continue

						for s in ['BI', 'BO', '?I', '?O']:
							idx = line.find(s)
							if idx != -1:
								out_of_sync = False
								# the four bytes before the marker are a little-endian
								# timestamp; fall back to the file's timestamp otherwise
								try:
									request[TIMESTAMP] = struct.unpack('<I', line[:idx])[0]
								except Exception:
									request[TIMESTAMP] = time.mktime(file_dt.timetuple())
								request[AGENT] = line[idx]
								request[DIRECTION] = line[idx+1]
								request[REFERRER] = normalize_url(line[idx+2:]).split('/')[0]
								break
						else:
							# the for-else only runs when no marker was found
							out_of_sync = True
						stage = READING_TARGET
				
					# reading the requested host
					elif stage == READING_TARGET:
						request[TARGET] = normalize_url(line.split('/')[0])
						stage = READING_FILEPATH
			
					# reading the requested file
					# after this step, the reading of the request is done and it
					# can be matched to any supplied criteria
					elif stage == READING_FILEPATH:
						is_valid = True
						for val in request:
							if val is None:
								is_valid = False
								break

						if is_valid:
							dt = datetime.datetime.fromtimestamp(request[TIMESTAMP])

							# if the record's timestamp is more than an hour away from
							# the file's timestamp, fall back to the file timestamp;
							# total_seconds() counts whole days as well, unlike .seconds
							tdelta = abs(dt - file_dt)
							if tdelta.total_seconds() / 60.0 / 60.0 > 1:
								request[TIMESTAMP] = time.mktime(file_dt.timetuple())

							is_valid = (request[AGENT] == 'B' or request[AGENT] == '?') and \
								(request[DIRECTION] == 'I' or request[DIRECTION] == 'O')

						if is_valid:
							if request[AGENT] == 'B' and request[DIRECTION] == 'O':
								valid_writer.writerow([
									request[TIMESTAMP], 
									request[REFERRER], 
									request[TARGET]
								])
						else:
							invalid_destf.write("%s\n" % prevprev)
							invalid_destf.write("%s\n" % prev)
							invalid_destf.write("%s\n" % line)

						# reset the variables describing the request
						request = [None, None, None, None, None]
						stage = READING_REFERRER
				
					# we should never get here
					else:
						raise ValueError("Invalid stage: %d" % stage)

					prevprev = prev
					prev = line
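
The stage constants, the request field indices, SKIP_LINES, and parse_dt_from_filename are defined elsewhere; plausible sketches, assuming filenames embed a date like requests-20090615.log:

import datetime
import re

# field indices into a request: [timestamp, referrer, target, direction, agent]
TIMESTAMP, REFERRER, TARGET, DIRECTION, AGENT = range(5)

# parser stages
READING_REFERRER, READING_TARGET, READING_FILEPATH = range(3)

# lines to ignore verbatim; populated elsewhere
SKIP_LINES = set()

def parse_dt_from_filename(path):
	# hypothetical: pull a YYYYMMDD date out of the filename
	m = re.search(r'(\d{8})', path)
	return datetime.datetime.strptime(m.group(1), '%Y%m%d')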