# Fetch the first page of the list; its "Page 1 of N" text tells us how many
# pages have to be downloaded in total.
resp = session.get(url)
logger.info('Parsing content')
parsed_html = bs4.BeautifulSoup(resp.text, 'html.parser')

# Note: There should be only one element called div.desc, but there's no
# guarantee -- it may also be missing entirely, so check before using it
# instead of failing with an opaque AttributeError on None.
desc = parsed_html.find('div', class_='desc')
pages_text = desc.get_text() if desc is not None else ''
pages_match = re.search(r'Page 1 of ([0-9]+)', pages_text)
if pages_match is None:
    logger.error('Could not find "Page 1 of N" pagination info in %s', url)
    raise RuntimeError('Could not determine the number of pages to download')
config['num_pages'] = int(pages_match.group(1))
print('Found {0} pages'.format(config['num_pages']))

# Clamp the requested start page to the last page that actually exists.
if args.start > config['num_pages']:
    print('Start page %s is greater than found pages: %s'
          % (args.start, config['num_pages']))
    print('Setting start pages to last page')
    args.start = config['num_pages']

# Filled in while the pages are downloaded -- presumably by download_page;
# verify against its definition.
imdb_all = []

# The output file's base name (without extension) is used as the user name
# in the "<username> rated" column header.
username = os.path.splitext(os.path.basename(args.outfile))[0]

# 'wb' truncates any existing file, so the header is always written to a
# fresh file.
with codecs.open(args.outfile, 'wb') as outfile:
    w = UnicodeWriter(outfile)
    w.writerow(['position', 'const', 'created', 'modified', 'description',
                'Title', 'Title type', 'Directors',
                '{0} rated'.format(username), 'IMDb Rating', 'Runtime (mins)',
                'Year', 'Genres', 'Num. Votes',
                'Release Date (month/day/year)', 'URL'])

# Schedule one download job per page and wait for all of them to finish.
for page in get_start_positions(config['num_pages'], args.start):
    pool.spawn(download_page, page[0], page[1])
pool.join()

# Append all collected rows below the header written above.
with codecs.open(args.outfile, 'ab') as outfile:
    w = UnicodeWriter(outfile)
    w.writerows(imdb_all)

end_time = time.time()
elapsed = pretty_seconds(end_time - start_time)
print('Downloaded %s ratings in %s' % (len(imdb_all), elapsed))
logger.info('Downloaded %s ratings in %s', len(imdb_all), elapsed)
print('Saved results in %s' % (args.outfile,))
def handle(self):
    """
    Reply to incoming requests.

    Request: <COMMAND> <FILTER-TYPE> [SUBSTRING ...]

        COMMAND:     ALL|OPEN|INGAME
        FILTER-TYPE: NONE|MOD|HOST
                     (DESC is part of the documented protocol but is not
                     implemented here; it is rejected as unknown.)
        SUBSTRING:   The text to look for in the column FILTER-TYPE. If
                     space[s] is encountered, each word must be in the field
                     (AND). If '|' is encountered, word[s] before and after
                     it will be searched for separately and all results will
                     be returned (OR). A host matching several alternatives
                     is returned only once.

    Reply:
        1st line: 'START <ISO 8601 timestamp, UTC>'
        2nd:      List of hosts as an UTF-8 encoded CSV, quoting every
                  field. The list will be filtered if FILTER-TYPE != NONE.
                  Omitted when no host matches.
        3rd:      'END <length of list>'

    Malformed requests are logged and skipped; the connection stays open.
    The method returns when the client disconnects, the server is shutting
    down, or the connection is older than MAX_CONNECTION_LENGTH seconds.
    """
    # Maps the FILTER-TYPE keyword to the host attribute that is searched.
    filter_fields = {"MOD": "gameName", "HOST": "founder"}
    try:
        for line in self.rfile:
            # loop until disconnect or server shutdown
            if self.server.shutdown_now:
                logger.info("(%s:%d) server shut down already, bye bye",
                            self.client_address[0], self.client_address[1])
                self.finish()
                return

            # remote sockets are not always closed, kill myself after
            # MAX_CONNECTION_LENGTH seconds
            age = datetime.datetime.now() - self.thread.start_time
            if age > datetime.timedelta(seconds=MAX_CONNECTION_LENGTH):
                logger.info(
                    "(%s:%d) Running since %s (>%d sec) in thread %s, killing myself.",
                    self.client_address[0], self.client_address[1],
                    self.thread.start_time.strftime("%Y-%m-%d %H:%M:%S"),
                    MAX_CONNECTION_LENGTH, self.thread.name)
                self.finish()
                return

            # Statistics are recorded on the raw request, before validation.
            self.server.query_stats_add(line)
            line = line.split()
            # Every FILTER-TYPE except NONE needs at least one search word.
            if len(line) < 2 or (len(line) == 2 and line[1] != "NONE"):
                logger.error("(%s:%d) Format error: '%s'",
                             self.client_address[0], self.client_address[1],
                             line)
                continue

            # COMMAND
            if line[0] == "ALL":
                host_list = list(self.hosts.values())
            elif line[0] == "OPEN":
                host_list = list(self.hosts_open.values())
            elif line[0] == "INGAME":
                host_list = list(self.hosts_ingame.values())
            else:
                logger.error("(%s:%d) Unknown COMMAND '%s'.",
                             self.client_address[0], self.client_address[1],
                             line[0])
                continue

            # FILTER-TYPE
            if line[1] == "NONE":
                host_list_filtered = host_list
            elif line[1] in filter_fields:
                field = filter_fields[line[1]]
                alternatives = " ".join(line[2:]).split("|")
                # Test each host once against all OR-alternatives, so a host
                # matching more than one alternative is not listed (and
                # counted in END) several times.
                host_list_filtered = [
                    host for host in host_list
                    if any(substr_search(words, getattr(host, field))
                           for words in alternatives)
                ]
            else:
                # Report the offending FILTER-TYPE (line[1]), not the COMMAND.
                logger.error("(%s:%d) Unknown FILTER-TYPE '%s'.",
                             self.client_address[0], self.client_address[1],
                             line[1])
                continue

            # Assemble the reply from byte strings only. Concatenating the
            # unicode header with the CSV buffer would make Python 2 decode
            # the buffer as ASCII and fail on non-ASCII host data.
            # NOTE(review): assumes UnicodeWriter writes UTF-8 encoded bytes
            # into the buffer, as the docstring states -- confirm.
            timestamp = datetime.datetime.utcnow().isoformat()
            chunks = [(u"START %s\n" % timestamp).encode("utf-8")]
            if host_list_filtered:
                csvfile = cStringIO.StringIO()
                try:
                    csvwriter = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
                    csvwriter.writerow(host_list_filtered[0].as_list_header())
                    csvwriter.writerows(
                        [host.as_list() for host in host_list_filtered])
                    chunks.append(csvfile.getvalue())
                finally:
                    csvfile.close()
            chunks.append((u"END %d\n" % len(host_list_filtered)).encode("utf-8"))
            self.wfile.write(b"".join(chunks))
    except socket.error:
        # client disconnected. that's OK, thread will terminate now
        # total_seconds() also counts whole days, unlike .seconds.
        connected = datetime.datetime.now() - self.thread.start_time
        logger.debug("(%s:%d) client disconnected after %0.1f min",
                     self.client_address[0], self.client_address[1],
                     connected.total_seconds() / 60.0)
        self.finish()
        return
def handle(self):
    """
    Reply to incoming requests.

    Request: <COMMAND> <FILTER-TYPE> [SUBSTRING ...]

        COMMAND:     ALL|OPEN|INGAME
        FILTER-TYPE: NONE|MOD|HOST
                     (DESC is part of the documented protocol but is not
                     implemented here; it is rejected as unknown.)
        SUBSTRING:   The text to look for in the column FILTER-TYPE. If
                     space[s] is encountered, each word must be in the field
                     (AND). If '|' is encountered, word[s] before and after
                     it will be searched for separately and all results will
                     be returned (OR). A host matching several alternatives
                     is returned only once.

    Reply:
        1st line: 'START <ISO 8601 timestamp, UTC>'
        2nd:      List of hosts as an UTF-8 encoded CSV, quoting every
                  field. The list will be filtered if FILTER-TYPE != NONE.
                  Omitted when no host matches.
        3rd:      'END <length of list>'

    Malformed requests are logged and skipped; the connection stays open.
    The method returns when the client disconnects, the server is shutting
    down, or the connection is older than MAX_CONNECTION_LENGTH seconds.
    """
    # Maps the FILTER-TYPE keyword to the host attribute that is searched.
    filter_fields = {"MOD": "gameName", "HOST": "founder"}
    try:
        for line in self.rfile:
            # loop until disconnect or server shutdown
            if self.server.shutdown_now:
                logger.info("(%s:%d) server shut down already, bye bye",
                            self.client_address[0], self.client_address[1])
                self.finish()
                return

            # remote sockets are not always closed, kill myself after
            # MAX_CONNECTION_LENGTH seconds
            age = datetime.datetime.now() - self.thread.start_time
            if age > datetime.timedelta(seconds=MAX_CONNECTION_LENGTH):
                logger.info(
                    "(%s:%d) Running since %s (>%d sec) in thread %s, killing myself.",
                    self.client_address[0], self.client_address[1],
                    self.thread.start_time.strftime("%Y-%m-%d %H:%M:%S"),
                    MAX_CONNECTION_LENGTH, self.thread.name)
                self.finish()
                return

            # Statistics are recorded on the raw request, before validation.
            self.server.query_stats_add(line)
            line = line.split()
            # Every FILTER-TYPE except NONE needs at least one search word.
            if len(line) < 2 or (len(line) == 2 and line[1] != "NONE"):
                logger.error("(%s:%d) Format error: '%s'",
                             self.client_address[0], self.client_address[1],
                             line)
                continue

            # COMMAND
            if line[0] == "ALL":
                host_list = list(self.hosts.values())
            elif line[0] == "OPEN":
                host_list = list(self.hosts_open.values())
            elif line[0] == "INGAME":
                host_list = list(self.hosts_ingame.values())
            else:
                logger.error("(%s:%d) Unknown COMMAND '%s'.",
                             self.client_address[0], self.client_address[1],
                             line[0])
                continue

            # FILTER-TYPE
            if line[1] == "NONE":
                host_list_filtered = host_list
            elif line[1] in filter_fields:
                field = filter_fields[line[1]]
                alternatives = " ".join(line[2:]).split("|")
                # Test each host once against all OR-alternatives, so a host
                # matching more than one alternative is not listed (and
                # counted in END) several times.
                host_list_filtered = [
                    host for host in host_list
                    if any(substr_search(words, getattr(host, field))
                           for words in alternatives)
                ]
            else:
                # Report the offending FILTER-TYPE (line[1]), not the COMMAND.
                logger.error("(%s:%d) Unknown FILTER-TYPE '%s'.",
                             self.client_address[0], self.client_address[1],
                             line[1])
                continue

            # Assemble the reply from byte strings only. Concatenating the
            # unicode header with the CSV buffer would make Python 2 decode
            # the buffer as ASCII and fail on non-ASCII host data.
            # NOTE(review): assumes UnicodeWriter writes UTF-8 encoded bytes
            # into the buffer, as the docstring states -- confirm.
            timestamp = datetime.datetime.utcnow().isoformat()
            chunks = [(u"START %s\n" % timestamp).encode("utf-8")]
            if host_list_filtered:
                csvfile = cStringIO.StringIO()
                try:
                    csvwriter = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
                    csvwriter.writerow(host_list_filtered[0].as_list_header())
                    csvwriter.writerows(
                        [host.as_list() for host in host_list_filtered])
                    chunks.append(csvfile.getvalue())
                finally:
                    csvfile.close()
            chunks.append((u"END %d\n" % len(host_list_filtered)).encode("utf-8"))
            self.wfile.write(b"".join(chunks))
    except socket.error:
        # client disconnected. that's OK, thread will terminate now
        # total_seconds() also counts whole days, unlike .seconds.
        connected = datetime.datetime.now() - self.thread.start_time
        logger.debug("(%s:%d) client disconnected after %0.1f min",
                     self.client_address[0], self.client_address[1],
                     connected.total_seconds() / 60.0)
        self.finish()
        return