def get_comment_info():
    global info_list
    pagenum = 1
    while True:
        print(pagenum)
        url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg"
        querystring = {"biztype": "1", "topid": "237773700", "cmd": "8",
                       "pagenum": pagenum, "pagesize": "25"}
        response = requests.request("GET", url, params=querystring)
        resp = json.loads(response.text)
        commentlist = resp.get('comment').get('commentlist')
        if not commentlist:
            return
        for comment in commentlist:
            one_name = comment.get('nick')
            # Stop once comments are older than the cutoff timestamp
            if comment.get('time') < 1568735760:
                return
            # Convert the UNIX timestamp into a human-readable format
            time_local = time.localtime(comment.get('time'))
            one_time = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            one_praisenum = comment.get('praisenum')
            one_comment = comment.get('rootcommentcontent')
            # re.sub returns a new string, so the result must be assigned back
            one_comment = ILLEGAL_CHARACTERS_RE.sub(r'', one_comment)
            one_name = ILLEGAL_CHARACTERS_RE.sub(r'', one_name)
            info = [one_name, one_time, one_praisenum, one_comment]
            info_list.append(info)
        pagenum += 1
def export_csv(self):
    # Export the query result as a CSV file.
    # Use the column names as the header row.
    self.conn.cursorclass = pymysql.cursors.DictCursor
    with self.conn.cursor() as cursor:
        cursor.execute(self.sql)
        title = list(cursor.fetchone())
    with open(r'%s' % self.file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=title)
        writer.writeheader()
        # Fetch the data and write it out
        if self.affected_row <= 100000:
            # Up to 100,000 rows: fetchall reads the whole result into memory
            self.conn.cursorclass = pymysql.cursors.DictCursor
            with self.conn.cursor() as cursor:
                msg = f'正在导出SQL:{self.sql}'
                self.pull_msg(msg)
                self.execute_log.append(msg)
                cursor.execute(self.sql)
                rows = cursor.fetchall()
                msg = '正在处理数据\n编码为:UTF-8'
                self.pull_msg(msg)
                self.execute_log.append(msg)
                for row in rows:
                    # Strip characters that are illegal in spreadsheets
                    for k, v in row.items():
                        row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v) if isinstance(v, str) else v
                    writer.writerow(row)
        else:
            # More than 100,000 rows: stream with a server-side cursor
            self.conn.cursorclass = pymysql.cursors.SSDictCursor
            with self.conn.cursor() as cursor:
                msg = f'正在导出SQL:{self.sql}'
                self.pull_msg(msg)
                self.execute_log.append(msg)
                cursor.execute(self.sql)
                while True:
                    row = cursor.fetchone()
                    if not row:
                        break
                    # Strip characters that are illegal in spreadsheets
                    for k, v in row.items():
                        row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v) if isinstance(v, str) else v
                    writer.writerow(row)
def remove_illegal_characters(s):
    if not s:
        return u''
    # Never let this raise; fall back to an empty string
    # noinspection PyBroadException
    try:
        try:
            return ILLEGAL_CHARACTERS_RE.sub(r'', s)
        except UnicodeEncodeError:
            # Python 2 byte strings must be decoded first
            return ILLEGAL_CHARACTERS_RE.sub(r'', s.decode('utf-8'))
    except Exception:
        return u''
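A minimal usage sketch for the helper above. It assumes ILLEGAL_CHARACTERS_RE is the control-character pattern that ships with openpyxl; the import location below is an assumption, since the regex is not defined anywhere in this listing:

# Sketch only: the pattern matches C0 control characters (\x00-\x08,
# \x0b-\x0c, \x0e-\x1f) that xlsx cells reject.
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE  # assumed import location

dirty = 'hello\x00world\x07'                 # contains NUL and BEL control bytes
print(remove_illegal_characters(dirty))      # -> 'helloworld'
print(remove_illegal_characters(None))       # -> '' (falsy input short-circuits)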
def write_excel(schools, containers):
    print('正在保存文件.......')
    wb = Workbook()
    sheet0 = wb.create_sheet('学校统计数据', index=0)
    head = ['学校', '学校简介', '录取规则', '奖学金设置', '食宿条件',
            '联系方法', '收费项目', '毕业生就业', '体检要求', '其他']
    col0 = schools
    print('学校总数:', len(col0))
    # Header row
    for i, item in enumerate(head):
        sheet0.cell(row=1, column=i + 1, value=item)
    # One row per school; strip illegal characters from every text field
    for i in range(len(col0)):
        sheet0.cell(row=i + 2, column=1, value=col0[i])
        for j in range(9):
            sheet0.cell(row=i + 2, column=j + 2,
                        value=ILLEGAL_CHARACTERS_RE.sub(r'', str(containers[i][j])))
    # Save the file
    wb.save('学校统计数据.xlsx')
def text_predict(examples, model, tokenizer, id2label):
    model.eval()
    labels_list = []
    for text in examples:
        text = str(text).strip()
        # Strip illegal characters before tokenizing
        sentences = ILLEGAL_CHARACTERS_RE.sub(r'', text)
        sequence_dict = tokenizer.encode_plus(sentences,
                                              max_length=args.max_length,
                                              pad_to_max_length=True,
                                              truncation=True)
        token_ids = sequence_dict['input_ids']
        token_mask = sequence_dict['attention_mask']
        token_segment_type = tokenizer.create_token_type_ids_from_sequences(
            token_ids_0=token_ids[1:-1])
        token_ids = torch.LongTensor(token_ids).unsqueeze(0)
        token_mask = torch.LongTensor(token_mask).unsqueeze(0)
        token_segment_type = torch.LongTensor(token_segment_type).unsqueeze(0)
        with torch.no_grad():
            inputs = {
                'input_ids': token_ids,
                'token_type_ids': token_segment_type,
                'attention_mask': token_mask
            }
            logits = model(**inputs)
            _, predict = logits[0].max(1)
            label = id2label[predict.item()]
            labels_list.append(label)
    return labels_list
def get_app_des(app_id, country):
    data = {'appid': app_id, 'country': country}
    params = {
        'analysis': analysis_parser(data, url='/app/baseinfo'),
        'appid': data['appid'],
        'country': data['country']
    }
    res = requests.get(url='https://api.qimai.cn/app/baseinfo',
                       params=params, headers=headers)
    html = json.loads(res.text)
    description = html['description']
    description = re.sub(r'<br />', '', description)
    description = ILLEGAL_CHARACTERS_RE.sub(r'', description)
    appinfo = str(html['appInfo'])
    try:
        # Supported languages ("支持语言")
        yy = re.findall(r"'name': '支持语言', 'value': '(.*?)'}", appinfo)[0]
    except IndexError:
        yy = ''
    try:
        # Release date ("发布日期")
        start_date = re.findall(r"'name': '发布日期', 'value': '(.*?)'}", appinfo)[0]
    except IndexError:
        start_date = ''
    return yy, description, start_date
def getComment(driver, url, song):
    driver.get(url)
    sleep(1)
    driver.switch_to.frame('g_iframe')
    try:
        client = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]/div[1]/div/a')
        times = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]/div[@class="rp"]/div')
        thumbs = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]/div[@class="rp"]/a[1]')
        text = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]')
    except Exception:
        print('error')
        return None
    print('正在爬取:' + song)
    li = []
    # Only keep the first 15 comments
    for n, i in enumerate(times):
        if n >= 15:
            break
        # Parse the like count, e.g. "(1.2万)" -> 12000 ("万" = ten thousand)
        if re.search(r'[(](.*)[)]', thumbs[n].text) is None:
            thumb = '0'
        else:
            thumb = re.findall(r'[(](.*)[)]', thumbs[n].text)[0]
        if re.search(r'万', thumb) is not None:
            thumb = int(float(thumb.split('万')[0]) * 10000)
        else:
            thumb = int(thumb)
        comment = {
            '歌曲': song,
            '用户': client[n].text,
            '内容': ILLEGAL_CHARACTERS_RE.sub(
                r'', re.findall(r':(.*)' + i.text,
                                text[n].get_attribute('textContent'))[0]),
            '时间': i.text,
            '点赞数': thumb,
            '类型': '评论',
            '关联评论': '无'
        }
        # "◆◆" separates a reply from the comment it replies to
        if re.search(r'◆◆', comment['内容']) is not None:
            tu = re.findall(r'(.*)◆◆(.*)', comment['内容'])[0]
            comment['内容'] = tu[0]
            comment['类型'] = '回复'
            comment['关联评论'] = tu[1]
        li.append(comment)
    return li
def __init__(self, file_path, tokenizer: BertTokenizer, max_length=512, device=None):
    news_type = []
    news_content = []
    news_atten_mask = []
    seq_typ_ids = []
    with open(file_path, mode='r', encoding='utf8') as f:
        for line in tqdm(f.readlines()):
            line = line.strip().split('\t')
            news_type.append(news_type2id_dict[line[0]])
            token_ids = tokenizer.encode(ILLEGAL_CHARACTERS_RE.sub(r'', line[1]),
                                         max_length=max_length,
                                         pad_to_max_length=True)
            news_content.append(token_ids)
            news_atten_mask.append(get_atten_mask(token_ids))
            seq_typ_ids.append(tokenizer.create_token_type_ids_from_sequences(
                token_ids_0=token_ids[1:-1]))
    self.label = torch.from_numpy(np.array(news_type)).unsqueeze(1).long()
    self.token_ids = torch.from_numpy(np.array(news_content)).long()
    self.seq_type_ids = torch.from_numpy(np.array(seq_typ_ids)).long()
    self.atten_masks = torch.from_numpy(np.array(news_atten_mask)).long()
    if device is not None:
        self.label = self.label.to(device)
        self.token_ids = self.token_ids.to(device)
        self.seq_type_ids = self.seq_type_ids.to(device)
        self.atten_masks = self.atten_masks.to(device)
def _export_to_csv(self, cnx):
    # Export the query result as a CSV file
    msg = f'正在执行导出SQL: {self.sql} \n'
    self.pm.pull(msg=msg)
    self.execute_log.append(msg)
    with open(self.tmp_file, 'w', newline='', encoding='utf-8') as csvfile:
        # Stream rows with a server-side cursor to avoid OOM on large result sets
        cnx.cursorclass = pymysql.cursors.SSDictCursor
        with cnx.cursor() as cursor:
            cursor.execute(self.sql)
            msg = '正在处理并生成CSV数据 \n'
            self.pm.pull(msg=msg)
            self.execute_log.append(msg)
            # Column names for the header row
            fieldnames = [x[0] for x in cursor.description]
            # Number of rows returned
            self.result['affected_rows'] = cursor.rownumber
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            while True:
                row = cursor.fetchone()
                if not row:
                    break
                # Strip characters that are illegal in spreadsheets
                for k, v in row.items():
                    row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v) if isinstance(v, str) else v
                writer.writerow(row)
def insert_to_excel(self, results, tags):
    wb = Workbook(optimized_write=True)
    ws = []
    # One sheet per tag
    for i in range(len(tags)):
        ws.append(wb.create_sheet(title=tags[i]))
    for i in range(len(tags)):
        ws[i].append(['序号', '书名', '评分', '评价人数', '作者/译者/出版社/出版时间/价格', '链接'])
        count = 1
        for booklist in results[i]:
            try:
                # Strip illegal characters from every field
                book_list = [ILLEGAL_CHARACTERS_RE.sub(r'', each) for each in booklist]
                if book_list:
                    ws[i].append([count, book_list[0], float(book_list[1]),
                                  int(book_list[2]), book_list[3], book_list[4]])
                    count += 1
                else:
                    print('Fail to save i:', i, book_list)
            except Exception:
                print('Fail to save i:', i, booklist)
    save_path = 'book_lists'
    for tag in tags:
        save_path += '-' + tag
    save_path += '.xlsx'
    wb.save(save_path)
def _sanitize_value(self, raw_value):
    # Prepend ' if raw_value starts with a potentially malicious character
    if self.sanitize_fields and raw_value:
        str_value = str(raw_value)
        # Remove ILLEGAL_CHARACTERS so openpyxl doesn't crash
        str_value = ILLEGAL_CHARACTERS_RE.sub('', str_value)
        return "'" + str_value if str_value.startswith(ESCAPE_CHARS) else str_value
    return raw_value
def insert_to_excel(self, results, tags):
    wb = Workbook(optimized_write=True)
    ws = []
    # One sheet per tag
    for i in range(len(tags)):
        ws.append(wb.create_sheet(title=tags[i]))
    for i in range(len(tags)):
        ws[i].append(['序号', '电影名', '评分', '评价人数', '制片地区/类型/上映年份/导演/主演', '链接'])
        count = 1
        for movielist in results[i]:
            try:
                # Strip illegal characters from every field
                movie_list = [ILLEGAL_CHARACTERS_RE.sub(r'', each) for each in movielist]
                if movie_list:
                    ws[i].append([count, movie_list[0], float(movie_list[1]),
                                  int(movie_list[2]), movie_list[3], movie_list[4]])
                    count += 1
                else:
                    print('Fail to save i:', i, movie_list)
            except Exception:
                print('Fail to save i:', i, movielist)
    save_path = 'movie_lists'
    for tag in tags:
        save_path += '-' + tag
    save_path += '.xlsx'
    wb.save(save_path)
def predict(text):
    global index
    text = str(text).strip()
    # Clean the text, then tokenize and pad to a fixed length
    token_ids = tokenizer.encode(ILLEGAL_CHARACTERS_RE.sub(r'', text),
                                 max_length=256, pad_to_max_length=True)
    token_mask = get_atten_mask(token_ids)
    token_segment_type = tokenizer.create_token_type_ids_from_sequences(
        token_ids_0=token_ids[1:-1])
    token_ids = torch.LongTensor(token_ids).unsqueeze(0)
    token_mask = torch.LongTensor(token_mask).unsqueeze(0)
    token_segment_type = torch.LongTensor(token_segment_type).unsqueeze(0)
    inputs = {
        'input_ids': token_ids,
        'token_type_ids': token_segment_type,
        'attention_mask': token_mask,
    }
    logits = classifier(**inputs)
    # "pred" rather than "predict", to avoid shadowing the function name
    _, pred = logits[0].max(1)
    index += 1
    return news_id_type_dict[pred.item()]
def write_sheet(self, sheet_name, sheet):
    worksheet = odf.table.Table(name=sheet_name)
    sheet_header = list(sheet)
    header_row = odf.table.TableRow()
    for header in sheet_header:
        header_row.addElement(self._make_cell(header))
    worksheet.addElement(header_row)
    for sheet_line in sheet.lines:
        row = odf.table.TableRow()
        for header in sheet_header:
            value = sheet_line.get(header)
            if isinstance(value, str):
                new_value = ILLEGAL_CHARACTERS_RE.sub("", value)
                if new_value != value:
                    warn(
                        _("Character(s) in '{}' are not allowed in a spreadsheet cell. "
                          "Those character(s) will be removed").format(value),
                        DataErrorWarning,
                    )
                    value = new_value
            row.addElement(self._make_cell(value))
        worksheet.addElement(row)
    self.workbook.spreadsheet.addElement(worksheet)
def sanitize_value(value):
    # Prepend ' if the value starts with a potentially malicious character
    if value:
        str_value = str(value)
        # Remove ILLEGAL_CHARACTERS so openpyxl doesn't crash
        str_value = ILLEGAL_CHARACTERS_RE.sub("", str_value)
        return "'" + str_value if str_value.startswith(ESCAPE_CHARS) else str_value
    return value
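The two sanitizer variants above guard against spreadsheet formula injection as well as illegal characters. ESCAPE_CHARS is never defined in this listing; the tuple below is a hypothetical, illustrative value only (str.startswith accepts a tuple), covering the characters commonly treated as formula starters:

# Hypothetical ESCAPE_CHARS, assumed for illustration; the real value lives in
# the module these functions were excerpted from.
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE  # assumed import location

ESCAPE_CHARS = ('=', '+', '-', '@', '\t', '\r')

print(sanitize_value('=HYPERLINK("http://example.com")'))  # -> "'=HYPERLINK(... (neutralized)
print(sanitize_value('plain text'))                        # -> 'plain text' (unchanged)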
def news_data_process(file_path):
    news_type = []
    news_content = []
    with open(file_path, mode='r', encoding='utf8') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            news_type.append(line[0])
            news_content.append(ILLEGAL_CHARACTERS_RE.sub(r'', line[1]))
    return news_type, news_content
def write_row(self, row):
    self.row_count += 1
    column_count = 0
    for cell_value in row:
        column_count += 1
        # Strip characters that are illegal in xlsx cells
        cell_value = ILLEGAL_CHARACTERS_RE.sub(r'', cell_value)
        self.ws.cell(row=self.row_count, column=column_count, value=cell_value)
def _export_to_xlsx(self, cnx):
    # Export as an xlsx file; write_only mode keeps memory usage low
    wb = Workbook(write_only=True)
    wb.encoding = self.encoding
    ws = wb.create_sheet()
    ws.title = self.title
    msg = f'正在执行导出SQL: {self.sql} \n'
    self.pm.pull(msg=msg)
    self.execute_log.append(msg)
    # Stream rows with a server-side cursor to avoid OOM on large result sets
    if not self.clickhouse:
        cnx.cursorclass = pymysql.cursors.SSCursor
    with cnx.cursor() as cursor:
        if self.clickhouse:
            cursor.set_stream_results(True, 1000)
        cursor.execute(self.sql)
        msg = '正在处理并生成XLSX数据 \n'
        self.pm.pull(msg=msg)
        self.execute_log.append(msg)
        # Header row
        if self.clickhouse:
            header = handle_duplicate_column([x.name for x in cursor.description])
        else:
            header = [x[0] for x in cursor.description]
        ws.append(header)
        # Number of rows returned
        if self.clickhouse:
            self.result['affected_rows'] = cursor.rowcount
        else:
            self.result['affected_rows'] = cursor.rownumber
        while True:
            row = cursor.fetchone()
            if not row:
                break
            # Strip characters that are illegal in spreadsheets
            filtered_row = [ILLEGAL_CHARACTERS_RE.sub(r'', x) if isinstance(x, str) else x
                            for x in row]
            # Avoid scientific-notation rendering for long numbers
            ws.append([self.correct_int_row(x) for x in filtered_row])
    # Save to file
    wb.save(self.tmp_file)
def main(wb, session, OrderModel, WebsiteModel):
    print('开始爬取码市订单')
    sheet = wb['Sheet']
    sheet.title = '码市'
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    count = 1
    website = session.query(WebsiteModel).get(1)
    for i in range(10, 0, -1):
        url = 'https://codemart.com/api/project?page=%d' % i
        result = get_one_page(url)
        if isinstance(result, list):
            for r in result:
                # pubtime is in milliseconds
                time_stamp = int(r['pubtime']) / 1000
                publish_time = datetime.fromtimestamp(time_stamp)
                if publish_time < time_point:
                    continue
                desc = ILLEGAL_CHARACTERS_RE.sub(r'', r['description'])
                cid = 'cm-{}'.format(r['id'])
                contact = get_contact(desc)
                link = 'https://codemart.com/project/{}'.format(r['id'])
                is_valid = r['status'] == '招募中'
                order_query = session.query(OrderModel).get(cid)
                if order_query:
                    was_valid = order_query.is_valid
                    order_query.is_valid = is_valid
                    if is_valid:
                        sheet.append([count, desc, link, publish_time, contact, ''])
                        count += 1
                        # Re-activate an order that had been marked deleted
                        if not was_valid:
                            order_query.is_delete = False
                    if was_valid and not is_valid:
                        order_query.is_delete = True
                else:
                    order = OrderModel(id=cid, desc=desc, link=link, contact=contact,
                                       category=r['cate'], pub_time=publish_time,
                                       is_valid=is_valid, is_delete=not is_valid)
                    order.website = website
                    session.add(order)
                    if is_valid:
                        sheet.append([count, desc, link, publish_time, contact, ''])
                        count += 1
            session.commit()
        elif isinstance(result, tuple):
            message = '码市爬取第%d行出错:%s' % (result[0], result[1])
            print(message)
            send_message(message)
    print('结束爬取码市订单')
def writeFile(data, file):
    wb1 = Workbook()
    # A new workbook starts with one active sheet; nothing is saved yet
    sheet = wb1.active
    sheet.title = 'Sheet1'
    # Header row
    sheet.append(["fadddr"])
    for item in data:
        # Strip illegal characters before writing the cell
        sheet.append([ILLEGAL_CHARACTERS_RE.sub(r'', item.get("faddr"))])
    wb1.save(file)
def write_content(path, ranking_sheet_name, ranking_content, reviews_sheet_names, reviews_content):
    try:
        workbook = openpyxl.load_workbook(path)
    except Exception:
        workbook = openpyxl.Workbook()
    sheet_name = format_sheet_name(ranking_sheet_name)
    try:
        sheet = workbook[sheet_name]
    except KeyError:
        sheet = workbook.create_sheet(sheet_name)
    for i in range(len(ranking_content)):
        for j in range(len(ranking_content[i])):
            tem_value = ILLEGAL_CHARACTERS_RE.sub(r'', str(ranking_content[i][j]))
            sheet.cell(row=i + 1, column=j + 1, value=tem_value)
    for index in range(len(reviews_content)):
        values = reviews_content[index]
        if values is None:
            continue
        sheet_name = format_sheet_name(reviews_sheet_names[index])
        try:
            sheet = workbook[sheet_name]
        except KeyError:
            sheet = workbook.create_sheet(sheet_name)
        for i in range(len(values)):
            for j in range(len(values[i])):
                tem_value = ILLEGAL_CHARACTERS_RE.sub(r'', str(values[i][j]))
                sheet.cell(row=i + 1, column=j + 1, value=tem_value)
    # Drop the default empty sheet if it is still present
    try:
        if workbook.sheetnames[0] == 'Sheet':
            workbook.remove(workbook[workbook.sheetnames[0]])
    except Exception:
        print(path, "Write fail")
    print(path, "Write success")
    workbook.save(path)
    workbook.close()
def _create_examples(self, path):
    examples = []
    with open(path, mode='r', encoding='utf8') as f:
        for guid, line in tqdm(enumerate(f.readlines())):
            line = line.strip().split('\t')
            label = line[0]
            text = ILLEGAL_CHARACTERS_RE.sub(r'', line[1])
            examples.append(InputExample(guid=guid, text_a=text, label=label))
    return examples
def crawler():
    # Keep pulling URLs until the work queue is empty
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        movie_list = soup.find(id='asyncRatingRegion')
        movies = movie_list.find_all('li')
        for movie in movies:
            name = movie.find('a')
            num = movie.find('div')
            info = movie.find_all('p')
            print('No.' + num.text)
            print('电影名' + name['title'])
            # Some movies have no director, cast, or synopsis, hence the try blocks
            try:
                director = info[0].find('a')
                # The text may contain illegal characters
                dir_ = ILLEGAL_CHARACTERS_RE.sub(r'', director.text)
            except Exception:
                dir_ = '无'
            try:
                actor = info[1].find('a')
                actor_ = ILLEGAL_CHARACTERS_RE.sub(r'', actor.text)
            except Exception:
                actor_ = '无'
            try:
                brief = info[2]
                brief_ = ILLEGAL_CHARACTERS_RE.sub(r'', brief.text)
            except Exception:
                brief_ = '无'
            movie_excel.append([num.text, name['title'], dir_, actor_, brief_])
            print('-------------\n')
def insert(result_list, title):
    wb = openpyxl.Workbook()
    ws = wb.active
    ws['A1'] = '分类'
    ws['B1'] = '地址'
    ws['C1'] = '标题'
    ws['D1'] = '数量'
    for result in result_list:
        # Strip illegal characters from every field
        row = [ILLEGAL_CHARACTERS_RE.sub(r'', str(i)) for i in result]
        # The last column is a count, so cast it back to int
        row[-1] = int(row[-1])
        ws.append(row)
    path = app_path() + '\\' + title + '.xlsx'
    wb.save(path)
def _export_to_csv(self, cnx):
    # Export the query result as a CSV file
    msg = f'正在执行导出SQL: {self.sql} \n'
    self.pm.pull(msg=msg)
    self.execute_log.append(msg)
    with open(self.tmp_file, 'w', newline='', encoding='utf-8') as csvfile:
        # Stream rows with a server-side cursor to avoid OOM on large result sets
        if not self.clickhouse:
            cnx.cursorclass = pymysql.cursors.SSDictCursor
        with cnx.cursor() as cursor:
            if self.clickhouse:
                cursor.set_stream_results(True, 1000)
            cursor.execute(self.sql)
            msg = '正在处理并生成CSV数据 \n'
            self.pm.pull(msg=msg)
            self.execute_log.append(msg)
            # Column names for the header row
            if self.clickhouse:
                header = handle_duplicate_column([x.name for x in cursor.description])
            else:
                header = [x[0] for x in cursor.description]
            fieldnames = header
            # Number of rows returned
            if self.clickhouse:
                self.result['affected_rows'] = cursor.rowcount
            else:
                self.result['affected_rows'] = cursor.rownumber
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            while True:
                row = cursor.fetchone()
                if not row:
                    break
                # The clickhouse streaming cursor yields tuples, not dicts
                if self.clickhouse:
                    row = dict(zip(header, row))
                # Strip characters that are illegal in spreadsheets
                for k, v in row.items():
                    row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v) if isinstance(v, str) else v
                writer.writerow(row)
def write_sheet(self, sheet_name, sheet):
    sheet_header = list(sheet)
    worksheet = self.workbook.create_sheet()
    worksheet.title = self.sheet_prefix + sheet_name
    worksheet.append(sheet_header)
    for sheet_line in sheet.lines:
        line = []
        for header in sheet_header:
            value = sheet_line.get(header)
            if isinstance(value, six.text_type):
                new_value = ILLEGAL_CHARACTERS_RE.sub('', value)
                if new_value != value:
                    warn("Character(s) in '{}' are not allowed in a spreadsheet cell. "
                         "Those character(s) will be removed".format(value),
                         DataErrorWarning)
                    value = new_value
            line.append(value)
        worksheet.append(line)
def parseHtml(html):
    html = html.replace('\\', '')
    html = ILLEGAL_CHARACTERS_RE.sub(r'', html)
    s = json.loads(html, strict=False)
    global line_index
    userInfo = []
    for key in s['userInfo']:
        line_index += 1
        a = (key['uid'], key['nickname'], key['age'], key['work_location'],
             key['height'], key['education'], key['matchCondition'],
             key['marriage'], key['shortnote'].replace('\n', ' '))
        userInfo.append(a)
    with open('sjjy.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(userInfo)
def parse_pkg_html(file, app_infos):
    # 'html.parser' is the parser; 'lxml' would also work
    soup = BeautifulSoup(open(file, encoding='utf-8'),
                         features='html.parser', from_encoding='UTF-8')
    table = soup.table
    if table is None:
        return
    tbody = table.tbody
    if tbody is None:
        return
    # One <tr> row per application
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) > 1:
            # UID
            app_id = str(tds[0]).split('<br', 1)[0].split('">', 1)[1].strip()
            uid = int(str(tds[1].a.string).strip())
            name = str(tds[2].a.contents[2].replace('[', '')).strip()
            domain = tds[2].a.attrs['href'].strip()
            domain = ILLEGAL_CHARACTERS_RE.sub(r'', domain)
            create_time = str(tds[3].string).strip()
            if tds[4].string is None:
                end_time = str(tds[4].font.string).strip()
            else:
                end_time = str(tds[4].string).strip()
            delivery = '' if tds[5].a is None else str(tds[5].a.string).strip()
            app_info = AppInfo(app_id=app_id, uid=uid, name=name, domain=domain,
                               create_time=create_time, end_time=end_time,
                               delivery=delivery)
            app_infos.append(app_info)
def export_xlsx(self):
    # Export the query result as an xlsx file.
    # num: suffix identifier (str) appended to the saved file name.
    wb = Workbook()
    wb.encoding = self.encoding
    ws = wb.active
    ws.title = self.title
    # Use the column names as the header row
    self.conn.cursorclass = pymysql.cursors.DictCursor
    with self.conn.cursor() as cursor:
        cursor.execute(self.sql)
        title = list(cursor.fetchone())
    ws.append(title)
    # Fetch the data and write it to the sheet.
    # SSCursor is an unbuffered cursor, mainly useful for queries that return
    # a lot of data, or for connections to remote servers over a slow network.
    self.conn.cursorclass = pymysql.cursors.SSCursor
    with self.conn.cursor() as cursor:
        msg = f'正在导出SQL:{self.sql}\n'
        self.execute_log.append(msg)
        pull_msg = {'status': 3, 'data': msg}
        async_to_sync(channel_layer.group_send)(
            self.user, {"type": "user.message", 'text': json.dumps(pull_msg)})
        cursor.execute(self.sql)
        while True:
            row = cursor.fetchone()
            if not row:
                break
            # Strip characters that are illegal in spreadsheets
            filtered_row = [ILLEGAL_CHARACTERS_RE.sub(r'', x) if isinstance(x, str) else x
                            for x in row]
            ws.append(filtered_row)
    wb.save(self.file)
    self.compress_file()
def export_field(self, field, obj):
    """Override export_field() to translate field values."""
    field_name = self.get_field_name(field)
    method = getattr(self, 'dehydrate_%s' % field_name, None)
    if method is not None:
        return method(obj)
    field_model = Aid._meta.get_field(field.column_name)
    if field_model.serialize:
        # simple fields with choices: use get_FOO_display to translate
        if field_model.choices:
            value = getattr(obj, f'get_{field.column_name}_display')()
            return field.widget.render(value, obj)
        # For Text and Char fields, we remove illegal characters
        elif isinstance(field_model, (TextField, CharField)):
            export_value = field.export(obj)
            export_value = ILLEGAL_CHARACTERS_RE.sub('', export_value)
            return export_value
        # ChoiceArrayField fields: need to translate a list
        elif hasattr(field_model, 'base_field') and field_model.base_field.choices:  # noqa
            value_raw = field.get_value(obj)
            if value_raw:
                # translate each dict choice
                value = [dict(field_model.base_field.choices).get(value, value)
                         for value in value_raw]  # noqa
                return field.widget.render(value, obj)
        # BooleanField fields: avoid returning 1 (True) and 0 (False)
        elif field_model.get_internal_type() == 'BooleanField':
            value_raw = field.get_value(obj)
            if value_raw is not None:
                return 'Oui' if value_raw else 'Non'
        # subvention_rate
        if field.column_name == 'subvention_rate':
            if field.get_value(obj) is None:
                return ''
            else:
                lower = field.get_value(obj).lower or ''
                upper = field.get_value(obj).upper or ''
                return f'[{lower}, {upper})'
    return field.export(obj)
def write_excel(title, distStr):
    # Create the workbook on first use
    if not os.path.exists('region.xlsx'):
        wb = openpyxl.Workbook()
        wb.save("region.xlsx")
    info = get_project_list_by_region(distStr)
    wb = openpyxl.load_workbook('region.xlsx')
    ws = wb.create_sheet(title=title)
    header = ['PROJ_NAME', 'PRV', 'IVALUE', 'INVESTCOUNT', 'PROJ_STATE_NAME',
              'START_TIME', 'Project_demonstration_level', 'RETURN_MODE_NAME',
              'ESTIMATE_COPER', 'OPERATE_MODE_NAME',
              'Ways_of_purchasing_social_capital']
    ws.append(header)
    for i in info:
        cell_info = []
        for t in header:
            # Strip illegal characters from each field
            cell_info.append(ILLEGAL_CHARACTERS_RE.sub(r'', str(i[t])))
        print(cell_info)
        ws.append(cell_info)
    wb.save("region.xlsx")
def sanitize(val):
    if not isinstance(val, text_type):
        return val
    return ILLEGAL_CHARACTERS_RE.sub("", val)
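Taken together, these snippets all follow one pattern: scrub strings through ILLEGAL_CHARACTERS_RE before handing them to a spreadsheet writer. A minimal end-to-end sketch of why this is necessary, assuming openpyxl (the import location is an assumption):

# Without sanitizing, openpyxl raises IllegalCharacterError when a cell value
# contains a control character; with the regex applied the row writes cleanly.
from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE  # assumed import location

wb = Workbook()
ws = wb.active
row = ['ok', 'bad\x01value']  # \x01 would be rejected if written raw
ws.append([ILLEGAL_CHARACTERS_RE.sub('', v) if isinstance(v, str) else v for v in row])
wb.save('demo.xlsx')          # saves cleanly; the second cell holds 'badvalue'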