def export_db(self, **kwargs):
    """Dump the GPPT_Submissions table to ``GPPT_Submissions.csv``.

    Keyword Args:
        format: export format. Only ``"csv"`` is implemented; any other
            value, or omitting the keyword, makes the call a no-op.

    The exported columns are every column reported by
    ``PRAGMA table_info`` except ``Attachment_Binary``, ``Message_Id`` and
    ``Attachment_Id``. The header row and the SELECT statement are built
    from that same list, so the header can never disagree with the data.
    """
    if kwargs.get("format") != "csv":
        return

    excluded = ("Attachment_Binary", "Message_Id", "Attachment_Id")
    schema = self.cur.execute(
        "PRAGMA table_info(GPPT_Submissions)").fetchall()
    # Index 1 of every table_info row holds the column name.
    colnames = [col[1] for col in schema if col[1] not in excluded]

    # Quote the identifiers (doubling embedded quotes) so unusual column
    # names cannot break the statement.
    quoted = ", ".join(
        '"%s"' % name.replace('"', '""') for name in colnames)
    sql = "SELECT %s FROM GPPT_Submissions" % quoted
    results = self.cur.execute(sql).fetchall()

    # The Python 2 csv module expects a file opened in binary mode;
    # text mode produces stray carriage returns on Windows.
    with open("GPPT_Submissions.csv", "wb") as f:
        writer = UnicodeWriter(f)
        writer.writerow(colnames)
        for row in results:
            writer.writerow(row)
    logging.info("Database dumped to GPPT_Submissions.csv")
def handle(self, *args, **options):
    """Export the councillor contact details of every ward to a CSV file.

    Options:
        file: path of the CSV file to write (truncated if it exists).
        city: comma-separated list of city names to export.

    One row is written per ward: city name, ward name, councillor first
    name, councillor last name, councillor email.

    Raises:
        CommandError: if ``file`` or ``city`` is missing or empty.
    """
    # .get() also rejects options that are present but None or empty,
    # which a plain key-presence test lets through.
    if not options.get('file'):
        raise CommandError("An output filename must be specified with -f=")
    if not options.get('city'):
        raise CommandError("At least one city must be specified with -c=")

    # The with-block closes the file even if a lookup below raises.
    # Binary mode is what the Python 2 csv machinery expects.
    with open(options['file'], 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        for city_name in options['city'].split(','):
            print(city_name)
            city = City.objects.get(name=city_name)
            for ward in Ward.objects.filter(city=city):
                councillor = ward.councillor
                writer.writerow([
                    city.name,
                    ward.name,
                    councillor.first_name,
                    councillor.last_name,
                    councillor.email,
                ])
def csv_out(imdb, file):
    """Write a sequence of dict records to *file* as CSV.

    Args:
        imdb: sequence of dicts; it is iterated twice, so it must not be
            a one-shot iterator.
        file: path of the output file (truncated if it exists).

    The header is the union of the keys of all records, in sorted order
    so the column layout is the same on every run. A record that lacks a
    key, or stores ``None`` for it, gets an empty cell in that column.
    """
    print('\nWriting to %s' % (file,))
    with codecs.open(file, 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        columns = sorted(set(key for record in imdb for key in record.keys()))
        writer.writerow(columns)
        for record in imdb:
            row = []
            for column in columns:
                value = record.get(column)
                row.append("" if value is None else value)
            writer.writerow(row)
def csv_out(imdb, file):
    """Write a sequence of dict records to *file* as CSV.

    Args:
        imdb: sequence of dicts; it is iterated twice, so it must not be
            a one-shot iterator.
        file: path of the output file (truncated if it exists).

    The header is the union of the keys of all records, in sorted order
    so the column layout is the same on every run. A record that lacks a
    key, or stores ``None`` for it, gets an empty cell in that column.
    """
    print('\nWriting to %s' % (file,))
    with codecs.open(file, 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        columns = sorted(set(key for record in imdb for key in record.keys()))
        writer.writerow(columns)
        for record in imdb:
            row = []
            for column in columns:
                value = record.get(column)
                row.append("" if value is None else value)
            writer.writerow(row)
def handle(self, *args, **options):
    """Export the councillor contact details of every ward to a CSV file.

    Options:
        file: path of the CSV file to write (truncated if it exists).
        city: comma-separated list of city names to export.

    One row is written per ward: city name, ward name, councillor first
    name, councillor last name, councillor email.

    Raises:
        CommandError: if ``file`` or ``city`` is missing or empty.
    """
    # .get() also rejects options that are present but None or empty,
    # which a plain key-presence test lets through.
    if not options.get('file'):
        raise CommandError("An output filename must be specified with -f=")
    if not options.get('city'):
        raise CommandError("At least one city must be specified with -c=")

    # The with-block closes the file even if a lookup below raises.
    # Binary mode is what the Python 2 csv machinery expects.
    with open(options['file'], 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        for city_name in options['city'].split(','):
            print(city_name)
            city = City.objects.get(name=city_name)
            for ward in Ward.objects.filter(city=city):
                councillor = ward.councillor
                writer.writerow([
                    city.name,
                    ward.name,
                    councillor.first_name,
                    councillor.last_name,
                    councillor.email,
                ])
def handle(self):
    """
    Reply to incoming requests.

    Request: <COMMAND> <FILTER-TYPE> [SUBSTRING ...]

    COMMAND: ALL|OPEN|INGAME
    FILTER-TYPE: NONE|MOD|HOST
        (DESC is not implemented here; it is rejected as an unknown
        FILTER-TYPE.)
    SUBSTRING: The text to look for in the column FILTER-TYPE. If space[s]
        is encountered, each word must be in the field (AND). If '|' is
        encountered, word[s] before and after it will be searched for
        separately and all results will be returned (OR). A host matching
        several alternatives is returned only once.

    Reply:
    1st line: 'START <ISO 8601 timestamp, UTC>'
    2nd: List of hosts as an UTF-8 encoded CSV, quoting every field. The
        separator is whatever UnicodeWriter defaults to (it is not set
        here -- presumably ';', TODO confirm). The list will be filtered
        if FILTER-TYPE != NONE. Omitted when no host matches.
    3rd: 'END <length of list>'

    Malformed requests are logged and skipped without a reply.
    """
    # Host attribute searched for each supported filter type.
    filter_attributes = {"MOD": "gameName", "HOST": "founder"}
    try:
        for line in self.rfile:
            # loop until disconnect or server shutdown
            if self.server.shutdown_now:
                logger.info("(%s:%d) server shut down already, bye bye",
                            self.client_address[0], self.client_address[1])
                self.finish()
                return
            # remote sockets are not always closed, kill myself after
            # MAX_CONNECTION_LENGTH seconds
            age = datetime.datetime.now() - self.thread.start_time
            if age > datetime.timedelta(seconds=MAX_CONNECTION_LENGTH):
                logger.info(
                    "(%s:%d) Running since %s (>%d sec) in thread %s, killing myself.",
                    self.client_address[0], self.client_address[1],
                    self.thread.start_time.strftime("%Y-%m-%d %H:%M:%S"),
                    MAX_CONNECTION_LENGTH, self.thread.name)
                self.finish()
                return

            self.server.query_stats_add(line)
            fields = line.split()
            if len(fields) < 2 or (len(fields) == 2 and fields[1] != "NONE"):
                logger.error("(%s:%d) Format error: '%s'",
                             self.client_address[0], self.client_address[1],
                             fields)
                continue
            command, filter_type = fields[0], fields[1]

            # COMMAND
            if command == "ALL":
                host_list = self.hosts.values()
            elif command == "OPEN":
                host_list = self.hosts_open.values()
            elif command == "INGAME":
                host_list = self.hosts_ingame.values()
            else:
                logger.error("(%s:%d) Unknown COMMAND '%s'.",
                             self.client_address[0], self.client_address[1],
                             command)
                continue

            # FILTER-TYPE
            if filter_type == "NONE":
                host_list_filtered = host_list
            elif filter_type in filter_attributes:
                attribute = filter_attributes[filter_type]
                alternatives = " ".join(fields[2:]).split("|")
                # any() keeps a host that matches several alternatives
                # from being listed (and counted) more than once.
                host_list_filtered = [
                    host for host in host_list
                    if any(substr_search(words, getattr(host, attribute))
                           for words in alternatives)
                ]
            else:
                # Report the offending FILTER-TYPE, not the COMMAND.
                logger.error("(%s:%d) Unknown FILTER-TYPE '%s'.",
                             self.client_address[0], self.client_address[1],
                             filter_type)
                continue

            # Assemble the reply from byte strings only: the CSV buffer
            # holds UTF-8 bytes, and mixing it with unicode objects makes
            # Python 2 decode it as ASCII and fail on non-ASCII names.
            parts = ["START %s\n" % datetime.datetime.utcnow().isoformat()]
            if host_list_filtered:
                csvfile = cStringIO.StringIO()
                csvwriter = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
                csvwriter.writerow(host_list_filtered[0].as_list_header())
                csvwriter.writerows(
                    [host.as_list() for host in host_list_filtered])
                parts.append(csvfile.getvalue())
                csvfile.close()
            parts.append("END %d\n" % len(host_list_filtered))
            self.wfile.write("".join(parts))
    except socket.error:
        # client disconnected. that's OK, thread will terminate now
        logger.debug(
            "(%s:%d) client disconnected after %0.1f min",
            self.client_address[0], self.client_address[1],
            (datetime.datetime.now() - self.thread.start_time).seconds / 60.0)
        self.finish()
        return
# Fetch the first page; its text is used below to find the page count.
resp = session.get(url)
logger.info('Parsing content')
parsed_html = bs4.BeautifulSoup(resp.text, 'html.parser')
# Note: There should be only one element called div.desc
# but there's no guarantee. find() returns the first match; if there is
# none, the .get_text() call below fails.
pages_text = parsed_html.find('div', class_='desc').get_text()
# The total page count is parsed from the "Page 1 of N" text; a missing
# match makes re.search return None and .group(1) raise.
config['num_pages'] = int(re.search('Page 1 of ([0-9]+)', pages_text).group(1))
print 'Found {0} pages'.format(config['num_pages'])
# Clamp the requested start page to the last available page.
if args.start > config['num_pages']:
    print 'Start page', args.start, 'is greater than found pages:', config['num_pages']
    print 'Setting start pages to last page'
    args.start = config['num_pages']
# Collected rows; nothing in this fragment fills the list -- presumably
# download_page appends to it (TODO confirm).
imdb_all = []
# The output file's base name (without extension) labels the "rated" column.
username = os.path.splitext(os.path.basename(args.outfile))[0]
with codecs.open(args.outfile, 'wb') as outfile:
    w = UnicodeWriter(outfile)
    # NOTE(review): mode 'wb' truncates the file, so the header is written
    # on every run, not only when the file did not exist before.
    w.writerow(['position','const','created','modified','description','Title','Title type','Directors',
                '{0} rated'.format(username),'IMDb Rating','Runtime (mins)','Year','Genres','Num. Votes',
                'Release Date (month/day/year)','URL'])
# Schedule one download task per start position, then wait for all of them.
for page in get_start_positions(config['num_pages'], args.start):
    pool.spawn(download_page, page[0], page[1])
pool.join()
# Re-open in append mode so the rows land below the header written above.
with codecs.open(args.outfile, 'ab') as outfile:
    w = UnicodeWriter(outfile)
    w.writerows(imdb_all)
# start_time is set outside this fragment.
end_time = time.time()
print 'Downloaded', len(imdb_all), 'ratings in', pretty_seconds(end_time - start_time)
logger.info('Downloaded %s ratings in %s', len(imdb_all), pretty_seconds(end_time - start_time))
print 'Saved results in', args.outfile
def handle(self):
    """
    Reply to incoming requests.

    Request: <COMMAND> <FILTER-TYPE> [SUBSTRING ...]

    COMMAND: ALL|OPEN|INGAME
    FILTER-TYPE: NONE|MOD|HOST
        (DESC is not implemented here; it is rejected as an unknown
        FILTER-TYPE.)
    SUBSTRING: The text to look for in the column FILTER-TYPE. If space[s]
        is encountered, each word must be in the field (AND). If '|' is
        encountered, word[s] before and after it will be searched for
        separately and all results will be returned (OR). A host matching
        several alternatives is returned only once.

    Reply:
    1st line: 'START <ISO 8601 timestamp, UTC>'
    2nd: List of hosts as an UTF-8 encoded CSV, quoting every field. The
        separator is whatever UnicodeWriter defaults to (it is not set
        here -- presumably ';', TODO confirm). The list will be filtered
        if FILTER-TYPE != NONE. Omitted when no host matches.
    3rd: 'END <length of list>'

    Malformed requests are logged and skipped without a reply.
    """
    # Host attribute searched for each supported filter type.
    filter_attributes = {"MOD": "gameName", "HOST": "founder"}
    try:
        for line in self.rfile:
            # loop until disconnect or server shutdown
            if self.server.shutdown_now:
                logger.info("(%s:%d) server shut down already, bye bye",
                            self.client_address[0], self.client_address[1])
                self.finish()
                return
            # remote sockets are not always closed, kill myself after
            # MAX_CONNECTION_LENGTH seconds
            age = datetime.datetime.now() - self.thread.start_time
            if age > datetime.timedelta(seconds=MAX_CONNECTION_LENGTH):
                logger.info(
                    "(%s:%d) Running since %s (>%d sec) in thread %s, killing myself.",
                    self.client_address[0], self.client_address[1],
                    self.thread.start_time.strftime("%Y-%m-%d %H:%M:%S"),
                    MAX_CONNECTION_LENGTH, self.thread.name)
                self.finish()
                return

            self.server.query_stats_add(line)
            fields = line.split()
            if len(fields) < 2 or (len(fields) == 2 and fields[1] != "NONE"):
                logger.error("(%s:%d) Format error: '%s'",
                             self.client_address[0], self.client_address[1],
                             fields)
                continue
            command, filter_type = fields[0], fields[1]

            # COMMAND
            if command == "ALL":
                host_list = self.hosts.values()
            elif command == "OPEN":
                host_list = self.hosts_open.values()
            elif command == "INGAME":
                host_list = self.hosts_ingame.values()
            else:
                logger.error("(%s:%d) Unknown COMMAND '%s'.",
                             self.client_address[0], self.client_address[1],
                             command)
                continue

            # FILTER-TYPE
            if filter_type == "NONE":
                host_list_filtered = host_list
            elif filter_type in filter_attributes:
                attribute = filter_attributes[filter_type]
                alternatives = " ".join(fields[2:]).split("|")
                # any() keeps a host that matches several alternatives
                # from being listed (and counted) more than once.
                host_list_filtered = [
                    host for host in host_list
                    if any(substr_search(words, getattr(host, attribute))
                           for words in alternatives)
                ]
            else:
                # Report the offending FILTER-TYPE, not the COMMAND.
                logger.error("(%s:%d) Unknown FILTER-TYPE '%s'.",
                             self.client_address[0], self.client_address[1],
                             filter_type)
                continue

            # Assemble the reply from byte strings only: the CSV buffer
            # holds UTF-8 bytes, and mixing it with unicode objects makes
            # Python 2 decode it as ASCII and fail on non-ASCII names.
            parts = ["START %s\n" % datetime.datetime.utcnow().isoformat()]
            if host_list_filtered:
                csvfile = cStringIO.StringIO()
                csvwriter = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
                csvwriter.writerow(host_list_filtered[0].as_list_header())
                csvwriter.writerows(
                    [host.as_list() for host in host_list_filtered])
                parts.append(csvfile.getvalue())
                csvfile.close()
            parts.append("END %d\n" % len(host_list_filtered))
            self.wfile.write("".join(parts))
    except socket.error:
        # client disconnected. that's OK, thread will terminate now
        logger.debug(
            "(%s:%d) client disconnected after %0.1f min",
            self.client_address[0], self.client_address[1],
            (datetime.datetime.now() - self.thread.start_time).seconds / 60.0)
        self.finish()
        return
import csv
from urllib import urlopen, urlencode

from bs4 import BeautifulSoup
from unicodewriter import UnicodeWriter

# Scrape 25 result pages (50 listings each) of a LinkedIn job search and
# save title, company, location and description of every listing to
# linkedin_job_companyname.csv.
url = 'https://www.linkedin.com/jobs/search?keywords=Tableau&locationId=us:0&orig=JSERP&count=50&'
# Example of a complete request URL:
# https://www.linkedin.com/jobs/search?keywords=Tableau&locationId=us:0&orig=JSERP&start=0&count=50
data = {}

# The with-block closes the output file even when a request or a parse
# step raises half-way through. Binary mode is what the Python 2 csv
# machinery expects.
with open('linkedin_job_companyname.csv', 'wb') as outfile:
    out_wrtr = UnicodeWriter(outfile)
    out_wrtr.writerow(['title', 'company', 'location', 'description'])

    for i in range(25):
        # 'start' is the offset of the first listing on the page.
        data['start'] = i*50
        data_url = url + urlencode(data)
        soup = BeautifulSoup(urlopen(data_url), 'lxml')

        for item in soup.findAll('li', 'job-listing'):
            title = item.find('span', 'job-title-text').text
            company = item.find('span', 'company-name-text').text
            location = item.find('span', 'job-location').find('span').text
            description = item.find('div', 'job-description').text

            # Progress output, repeated for every listing.
            print('Running: ' + str(i*50) + ' page.')
            print('#'.join([title, company, location, description]))
            print('\n')

            out_wrtr.writerow([title, company, location, description])