def test_open_too_many_files():
    """Regression test: many sheets in optimized-write mode must not
    exhaust file handles (OSError: 'too many open files')."""
    test_filename = _get_test_filename()
    wb = Workbook(optimized_write=True)
    # over 200 worksheets should raise an OSError ('too many open files')
    for _ in range(200):
        wb.create_sheet()
    wb.save(test_filename)
    os.remove(test_filename)
def process_dir(data_dir_path, output_file_path):
    """Process every file in *data_dir_path* into one workbook saved at
    *output_file_path*.

    The workbook gets six fixed sheets (Deliveries, Returns, Wastage,
    Staff Meals, Transfers In, Transfers Out); process_file() is expected
    to route each input file's rows into them.
    """
    # create a new workbook and name its sheets
    wb_new = Workbook()
    ws_deliveries = wb_new.get_active_sheet()
    ws_deliveries.title = 'Deliveries'
    # remaining sheets are created in a fixed order after the first
    for index, title in enumerate(
            ('Returns', 'Wastage', 'Staff Meals',
             'Transfers In', 'Transfers Out'), 1):
        wb_new.create_sheet(index).title = title
    # get the list of regular files in the directory (skip sub-dirs)
    onlyfiles = [f for f in listdir(data_dir_path)
                 if isfile(join(data_dir_path, f))]
    # BUG FIX: build each file path with join() instead of raw string
    # concatenation, which failed whenever data_dir_path lacked a
    # trailing separator.
    for f in onlyfiles:
        process_file(join(data_dir_path, f), wb_new)
    # save the new workbook
    wb_new.save(output_file_path)
def test_get_sheet_names():
    """Five create_sheet() calls auto-name the new sheets Sheet1..Sheet5."""
    wb = Workbook()
    expected = ['Sheet'] + ['Sheet%d' % n for n in range(1, 6)]
    for _ in range(5):
        wb.create_sheet(0)
    assert sorted(wb.get_sheet_names()) == sorted(expected)
def test_get_sheet_names():
    """create_sheet() auto-names new sheets Sheet1..Sheet5 (nose variant)."""
    wb = Workbook()
    expected = ["Sheet"] + ["Sheet%d" % n for n in range(1, 6)]
    for _ in range(5):
        wb.create_sheet(0)
    eq_(sorted(wb.get_sheet_names()), sorted(expected))
def createReport(conn,series,dateList):
    """Build the daily quotation report workbook for one car *series* and
    save it under settings.filePath.

    Python 2 code: sheet titles are unicode literals and the output file
    name is GBK-encoded for a Chinese Windows filesystem.
    """
    # create a new workbook
    wb = Workbook()
    # default sheet: "Report Notes" (报表说明)
    sheet = wb.worksheets[0]
    sheet.title = u'报表说明'
    # generate the "Report Notes" content
    createReportSpecification(sheet, conn, series, dateList)
    # create sheet: "Website dealer count trend analysis"
    sheet = wb.create_sheet(u'网站经销商数量变化分析', 1)
    # generate its content
    createAnalysisOfWebsiteDealer(sheet, conn, series, dateList)
    # create sheet: "Website average quote analysis"
    sheet = wb.create_sheet(u'网站报价均值分析', 2)
    # generate its content
    createAnalysisOfNetworkOffer(sheet, conn, series, dateList)
    # create sheet: "Regional quote analysis"
    sheet = wb.create_sheet(u'大区报价分析', 3)
    # generate its content
    createAnalysisOfAreaPrice(sheet, conn, series, dateList)
    # create sheet: "Provincial quote analysis"
    sheet = wb.create_sheet(u'省份报价分析', 4)
    # generate its content
    createTableAnalysisOfProvincesOffer(sheet, conn, series, dateList)
    # create sheet: "Quote details"
    sheet = wb.create_sheet(u'报价详细', 5)
    # generate its content
    createTableAnalysisOfDetailedQuotation(sheet, conn, series, dateList)
    # output path: <filePath>\report\<series>-报价日报_<last date>.xlsx
    file_dir = settings.filePath+'\\report\\'+series.encode('gbk')+'-报价日报_'.encode('gbk')+\
        dateList[len(dateList)-1].strftime('%Y-%m-%d')+'.xlsx'
    # save the file
    wb.save(file_dir)
def test_write_content_types():
    """[Content_Types].xml for a three-sheet workbook matches the fixture."""
    wb = Workbook()
    for _ in range(2):
        wb.create_sheet()
    reference_file = os.path.join(DATADIR, 'writer', 'expected',
                                  '[Content_Types].xml')
    assert_equals_file_content(reference_file, write_content_types(wb))
def test_write_properties_app(self):
    """app.xml for a three-sheet workbook matches the reference fixture."""
    wb = Workbook()
    for _ in range(2):
        wb.create_sheet()
    reference = os.path.join(DATADIR, 'writer', 'expected', 'app.xml')
    assert_equals_file_content(reference, write_properties_app(wb))
def FacebookProfiles(path=None, analysisid=None, delim=',', quote='"', num_to_str=False):
    # Export Facebook profile and comment-network CSVs found under *path*
    # into one workbook: an "All_Users" sheet from ua_fb_user.csv plus one
    # sheet per fb_comment_net*.csv metrics file.
    # Python 2 code (print statements; legacy 0-based openpyxl cell API).
    IDCOL = 0
    sheet_index = 0
    # ensure the directory path ends with a separator
    if path[len(path)-1] != '/':
        path += '/'
    profile_file_obj = 'ua_fb_user.csv'
    comment_net_files = get_comment_net_csvs(path)
    # map user id -> display name, built from the profile CSV
    id_user_map = user_map(path+profile_file_obj)
    #create workbook
    workbook = Workbook()
    #add profiles sheet
    sheet = workbook.create_sheet(sheet_index)
    sheet_index += 1
    sheet.title = "All_Users"
    for row_index, row in enumerate(
            csv.reader(open(path+profile_file_obj), delimiter=delim, quotechar=quote)):
        for col_index, col in enumerate(row):
            if col_index == IDCOL:
                # suffix ids with "_" so spreadsheets keep them as text
                col = str(col) + "_"
            sheet.cell(row=row_index, column=col_index).value = col
    #add network metrics sheet(s)
    for netfile in comment_net_files:
        uid = -1
        lastind = 0
        sheet = workbook.create_sheet(sheet_index)
        sheet_index += 1
        #verify function call on sheet object
        sheet.title = netfile.replace(".csv", "").replace("fb_comment_net", "").strip("_")
        reader = csv.reader(open(path+netfile), delimiter=delim, quotechar=quote)
        print 'writing network stats...'
        for row_index, row in enumerate(reader):
            if row_index == 0:
                # header row: insert a 'name' column after node_id and a
                # trailing 'link' column
                sheet.cell(row=row_index, column=0).value = 'node_id'
                sheet.cell(row=row_index, column=1).value = 'name'
                for col_index in range(1, len(row)):
                    sheet.cell(row=row_index, column=col_index + 1).value = row[col_index]
                sheet.cell(row=row_index, column=len(row) + 1).value = 'link'
            else:
                for col_index, col in enumerate(row):
                    if col_index == IDCOL:
                        col_ = col + "_"  # (for print out)
                        uid = col
                        sheet.cell(row=row_index, column=col_index).value = col_
                        sheet.cell(row=row_index, column=col_index + 1).value = id_user_map[col]
                    else:
                        sheet.cell(row=row_index, column=col_index + 1).value = col
                        lastind = col_index + 1
                # trailing profile link built from the row's user id
                sheet.cell(row=row_index, column=lastind + 1).value = "http://facebook.com/" + uid
    #save workbook
    print 'writing output file ' + path + "fb_comment_network_stats" + "_" + analysisid + ".xlsx"
    # NOTE(review): workbook is saved through a 'w' (text-mode) file object;
    # xlsx is binary -- this likely only works on Python 2. Verify.
    workbook.save(open(path + ("fb_comment_network_stats" + "_" + analysisid).upper() + ".xlsx", 'w'))
def test_write_content_types(datadir):
    """Generated content-types XML is equivalent to the stored fixture."""
    datadir.chdir()
    wb = Workbook()
    for _ in range(2):
        wb.create_sheet()
    content = write_content_types(wb)
    with open('[Content_Types].xml') as expected:
        diff = compare_xml(content, expected.read())
    assert diff is None, diff
def test_write_properties_app(datadir):
    """Generated app.xml is equivalent to the stored fixture."""
    datadir.join("writer").chdir()
    wb = Workbook()
    for _ in range(2):
        wb.create_sheet()
    content = write_properties_app(wb)
    with open('app.xml') as expected:
        diff = compare_xml(content, expected.read())
    assert diff is None, diff
def test_write_properties_app(self):
    """write_properties_app output matches the stored app.xml fixture."""
    wb = Workbook()
    for _ in range(2):
        wb.create_sheet()
    content = write_properties_app(wb)
    reference_file = os.path.join(DATADIR, "writer", "expected", "app.xml")
    with open(reference_file) as expected:
        diff = compare_xml(content, expected.read())
    assert diff is None
def test_write_content_types():
    """Content-types XML for a three-sheet workbook matches the fixture."""
    wb = Workbook()
    for _ in range(2):
        wb.create_sheet()
    content = write_content_types(wb)
    reference_file = os.path.join(DATADIR, 'writer', 'expected',
                                  '[Content_Types].xml')
    with open(reference_file) as expected:
        diff = compare_xml(content, expected.read())
    assert diff is None, diff
def write_DataFrame(dfs, folder, filename, meta_info=None, missing=None):
    """Write a list of DataFrames into one xlsx file (optimized writer).

    dfs       : list of dicts {'df': DataFrame,
                'sheetname': str (optional), 'cols': list (optional)}
    folder/filename : output location, joined with os.path.join
    meta_info : optional list of rows written to a leading 'read me' sheet
    missing   : value substituted for NaN cells

    Python 2 code (uses ``unicode``); numpy's int64/float64/bool_/isnan
    and the project's killgremlins() sanitiser are assumed in scope.
    """
    # BUG FIX: the old signature used a mutable default (meta_info=[]) and
    # mutated its rows in place, corrupting both the shared default object
    # and the caller's data. Use a None sentinel and write copies instead.
    if meta_info is None:
        meta_info = []
    writer = Workbook(optimized_write=True)
    if len(meta_info) > 0:
        ws = writer.create_sheet(title='read me')
        for row in meta_info:
            # stringify into a fresh list rather than mutating the input
            ws.append([str(value) for value in row])
    for info in dfs:
        if 'sheetname' not in info:
            ws = writer.create_sheet(title='Sheet' + str(len(writer.worksheets) + 1))
        else:
            ws = writer.create_sheet(title=info['sheetname'])
        if 'cols' not in info:
            cols = list(info['df'].axes[1])
        else:
            cols = info['cols']
        # stringified copy; the caller's 'cols' list is left untouched
        cols = [str(c) for c in cols]
        # header row, sanitised of non-printable characters
        ws.append([killgremlins(c) if type(c) in [str, unicode] else c
                   for c in cols])
        try:
            info['df'] = info['df'].reset_index()
        except:
            pass
        for i in range(len(info['df'])):
            row = list(info['df'].iloc[i][cols])
            for j in range(len(row)):
                # best-effort sanitise every value first
                try:
                    row[j] = killgremlins(row[j])
                except:
                    pass
                # NaN -> caller-chosen placeholder
                if type(row[j]) in [float, int64, float64] and isnan(row[j]):
                    row[j] = missing
                if type(row[j]) in [str, unicode]:
                    row[j] = killgremlins(row[j])
                elif type(row[j]) == bool_:
                    # numpy bools are written as Excel-style TRUE/FALSE text
                    row[j] = 'TRUE' if row[j] else 'FALSE'
                elif type(row[j]) == int64:
                    # numpy int64 is not serialisable by this writer
                    row[j] = float(row[j])
                elif type(row[j]) in [tuple, list]:
                    row[j] = str(row[j])
            ws.append(row)
        # keep the header row visible while scrolling
        ws.freeze_panes = 'A2'
    writer.save(os.path.join(folder, filename))
def test_write_hidden_worksheet():
    """A hidden sheet still serialises like the visible-sheet fixture."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.sheet_state = ws.SHEETSTATE_HIDDEN
    ws.cell("F42").value = "hello"
    reference = os.path.join(DATADIR, "writer", "expected", "sheet1.xml")
    assert_equals_file_content(reference, write_worksheet(ws, {"hello": 0}, {}))
def test_write_height():
    """An explicit row height is serialised into the sheet XML."""
    wb = Workbook()
    ws = wb.create_sheet()
    cell = ws.cell("F1")
    cell.value = 10
    ws.row_dimensions[cell.row].height = 30
    reference = os.path.join(DATADIR, "writer", "expected", "sheet1_height.xml")
    assert_equals_file_content(reference, write_worksheet(ws, {}, {}))
def test_hyperlink_value():
    """Setting a hyperlink seeds the cell value; a later value wins."""
    wb = Workbook()
    cell = wb.create_sheet().cell("A1")
    cell.hyperlink = "http://test.com"
    eq_("http://test.com", cell.value)
    cell.value = "test"
    eq_("test", cell.value)
def test_write_hyperlink():
    """A cell with a value plus hyperlink matches the hyperlink fixture."""
    wb = Workbook()
    ws = wb.create_sheet()
    cell = ws.cell("A1")
    cell.value = "test"
    cell.hyperlink = "http://test.com"
    reference = os.path.join(DATADIR, "writer", "expected",
                             "sheet1_hyperlink.xml")
    assert_equals_file_content(reference, write_worksheet(ws, {"test": 0}, {}))
def test_write_worksheet():
    """A single shared-string cell serialises to the sheet1.xml fixture."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.cell('F42').value = 'hello'
    reference = os.path.join(DATADIR, 'writer', 'expected', 'sheet1.xml')
    assert_equals_file_content(reference, write_worksheet(ws, {'hello': 0}, {}))
def test_write_style():
    """A percentage-formatted cell picks up its style id in the sheet XML."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.cell("F1").value = "13%"
    styles = StyleWriter(wb).get_style_by_hash()
    reference = os.path.join(DATADIR, "writer", "expected", "sheet1_style.xml")
    assert_equals_file_content(reference, write_worksheet(ws, {}, styles))
def test_decimal():
    """decimal.Decimal values serialise like the decimal.xml fixture."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.cell('A1').value = decimal.Decimal('3.14')
    reference = os.path.join(DATADIR, 'writer', 'expected', 'decimal.xml')
    assert_equals_file_content(reference, write_worksheet(ws, {}, {}))
def test_short_number():
    """A ten-digit int is written as a plain number (short_number fixture)."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.cell('A1').value = 1234567890
    reference = os.path.join(DATADIR, 'writer', 'expected', 'short_number.xml')
    assert_equals_file_content(reference, write_worksheet(ws, {}, {}))
def test_get_sheet_by_name():
    """get_sheet_by_name finds a sheet after its title is changed."""
    wb = Workbook()
    sheet = wb.create_sheet()
    sheet.title = "my sheet"
    eq_(sheet, wb.get_sheet_by_name("my sheet"))
def test_add_named_range():
    """An added NamedRange shows up in get_named_ranges()."""
    wb = Workbook()
    nr = NamedRange("test_nr", [(wb.create_sheet(), "A1")])
    wb.add_named_range(nr)
    assert nr in wb.get_named_ranges()
def test_freeze_panes_both():
    """freeze_panes='D4' freezes both rows and columns in the sheet XML."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.cell("F42").value = "hello"
    ws.freeze_panes = "D4"
    reference = os.path.join(DATADIR, "writer", "expected",
                             "sheet1_freeze_panes_both.xml")
    assert_equals_file_content(reference, write_worksheet(ws, {"hello": 0}, {}))
def test_get_named_range():
    """get_named_range returns the range registered under that name."""
    wb = Workbook()
    nr = NamedRange("test_nr", [(wb.create_sheet(), "A1")])
    wb.add_named_range(nr)
    eq_(nr, wb.get_named_range("test_nr"))
def test_hyperlink_value():
    """A hyperlink seeds the cell value until an explicit value replaces it."""
    wb = Workbook()
    cell = wb.create_sheet().cell('A1')
    cell.hyperlink = "http://test.com"
    assert cell.value == "http://test.com"
    cell.value = "test"
    assert cell.value == "test"
def test_get_named_range():
    """get_named_range returns the range registered under that name."""
    wb = Workbook()
    nr = NamedRange('test_nr', [(wb.create_sheet(), 'A1')])
    wb.add_named_range(nr)
    assert wb.get_named_range('test_nr') == nr
class XLSXRenderer(Renderer):
    """Render a TreeGrid into an .xlsx file via openpyxl's optimized writer."""

    def __init__(self, renderers_func, config):
        if not has_openpyxl:
            debug.error("You must install OpenPyxl 2.1.2 for xlsx format:\n\thttps://pypi.python.org/pypi/openpyxl")
        self._config = config
        self._columns = None
        self._text_cell_renderers_func = renderers_func
        self._text_cell_renderers = None
        self._wb = Workbook(optimized_write=True)
        self._ws = self._wb.create_sheet()

    def description(self):
        # header row: one entry per grid column
        return [column.name for column in self._columns]

    def _add_row(self, node, data):
        # visitor callback: assign the node the next row number and append
        # its values as a worksheet row; the dict doubles as the accumulator
        data[node] = max(data.values()) + 1
        self._ws.append(list(node.values))
        return data

    def render(self, outfd, grid):
        """Renders the TreeGrid in data out to the output file from the config options"""
        if not self._config.OUTPUT_FILE:
            debug.error("Please specify a valid output file using --output-file")
        self._columns = grid.columns
        self._text_cell_renderers = self._text_cell_renderers_func(self._columns)
        self._ws.append(self.description())
        grid.visit(None, self._add_row, {None: 0})
        self._wb.save(filename=self._config.OUTPUT_FILE)
def test_write_bool():
    """Boolean cell values serialise per the sheet1_bool.xml fixture."""
    wb = Workbook()
    ws = wb.create_sheet()
    ws.cell("F42").value = False
    ws.cell("F43").value = True
    reference = os.path.join(DATADIR, "writer", "expected", "sheet1_bool.xml")
    assert_equals_file_content(reference, write_worksheet(ws, {}, {}))
def test_get_sheet_by_name():
    """get_sheet_by_name finds a sheet after its title is changed."""
    wb = Workbook()
    sheet = wb.create_sheet()
    sheet.title = 'my sheet'
    assert wb.get_sheet_by_name('my sheet') == sheet
def test_create_sheet_with_name():
    """create_sheet(0, title=...) inserts the named sheet at index 0."""
    wb = Workbook()
    sheet = wb.create_sheet(0, title='LikeThisName')
    eq_(sheet, wb.worksheets[0])
def test_add_correct_sheet():
    """add_sheet appends an already-created sheet at the end (index 2)."""
    wb = Workbook()
    sheet = wb.create_sheet(0)
    wb.add_sheet(sheet)
    eq_(sheet, wb.worksheets[2])
def test_create_sheet_readonly():
    # create_sheet on a workbook switched to optimized-read mode;
    # NOTE(review): no assertion here -- presumably this should raise,
    # compare the pytest.raises variant elsewhere in the suite
    wb = Workbook()
    wb._set_optimized_read()
    wb.create_sheet()
#encoding=utf-8
import os
import json
from openpyxl.workbook import Workbook

# Build a summary workbook listing every extension-less entry found in the
# current working directory.
List = os.listdir(os.getcwd())
# BUG FIX: on Python 3 filter() returns a lazy iterator that cannot be
# concatenated to a list below -- use a list comprehension instead.
NewList = [name for name in List if "." not in name]

wb = Workbook()
sheet = wb.create_sheet("Summary")
# drop the default sheet so only "Summary" remains
sheet = wb["Sheet"]
wb.remove(sheet)

SumTitleLine = ["No", "Name", "Status", "Owner"]
sheet = wb["Summary"]
# index 0 carries a placeholder so the header lands on the first row
for index, rowValue in enumerate(["SumTitleLine"] + NewList):
    if index == 0:
        sheet.append(SumTitleLine)
    else:
        rowContent = [str(index), rowValue]
        sheet.append(rowContent)
sheet.column_dimensions['B'].width = 40
# NOTE(review): 'Staus' filename typo kept for output compatibility
wb.save("OWASP_Scan_Staus.xlsx")
def test_remove_sheet():
    """remove() detaches the sheet from wb.worksheets."""
    wb = Workbook()
    sheet = wb.create_sheet(0)
    wb.remove(sheet)
    assert sheet not in wb.worksheets
def test_create_sheet():
    """create_sheet(0) inserts the new sheet at the front."""
    wb = Workbook()
    sheet = wb.create_sheet(0)
    eq_(sheet, wb.worksheets[0])
def dump_workbook():
    """Smoke-test the optimized writer: one row saved to an in-memory buffer."""
    wb = Workbook(optimized_write=True)
    sheet = wb.create_sheet()
    sheet.append(range(30))
    wb.save(filename=BytesIO())
""" import argparse import csv import sys from openpyxl.workbook import Workbook parser = argparse.ArgumentParser() parser.add_argument("input_file") parser.add_argument("-d", "--delimiter", default="\t", help="select delimiter character") args = parser.parse_args() if not ".csv" in args.input_file: sys.stderr.write("Error: File does not have the ending \".csv\".\n") sys.exit(2) input_fh = open(args.input_file) workbook = Workbook() sheet = workbook.create_sheet(0) for row_index, row in enumerate( csv.reader(open(args.input_file), delimiter=args.delimiter)): for col_index, col in enumerate(row): sheet.cell(row=row_index, column=col_index).value = col workbook.save(open(args.input_file.replace(".csv", ".xlsx"), "w"))
def test_get_index():
    """A sheet appended after the default 'Sheet' sits at index 1."""
    wb = Workbook()
    sheet = wb.create_sheet()
    assert wb.get_index(sheet) == 1
def test_get_index():
    # NOTE(review): creates a sheet at index 0 but asserts nothing --
    # this looks truncated; the sibling tests check wb.get_index()
    wb = Workbook()
    new_sheet = wb.create_sheet(0)
class TestStyleWriter(object):
    # Tests for StyleWriter: style-table deduplication and the exact XML
    # emitted for fonts, fills, borders, cell xfs and alignment.

    def setUp(self):
        # fresh workbook + sheet before every test
        self.workbook = Workbook()
        self.worksheet = self.workbook.create_sheet()

    def test_no_style(self):
        # an untouched sheet contributes no entries to the style table
        w = StyleWriter(self.workbook)
        eq_(0, len(w.style_table))

    def test_nb_style(self):
        # five distinct font sizes -> five entries; a border tweak adds one
        for i in range(1, 6):
            self.worksheet.cell(row=1, column=i).style.font.size += i
        w = StyleWriter(self.workbook)
        eq_(5, len(w.style_table))
        self.worksheet.cell('A10').style.borders.top = Border.BORDER_THIN
        w = StyleWriter(self.workbook)
        eq_(6, len(w.style_table))

    def test_style_unicity(self):
        # identical styles on five cells collapse to a single table entry
        for i in range(1, 6):
            self.worksheet.cell(row=1, column=i).style.font.bold = True
        w = StyleWriter(self.workbook)
        eq_(1, len(w.style_table))

    def test_fonts(self):
        # a sized+bold font is appended after the default Calibri font
        self.worksheet.cell('A1').style.font.size = 12
        self.worksheet.cell('A1').style.font.bold = True
        w = StyleWriter(self.workbook)
        w._write_fonts()
        eq_(
            get_xml(w._root),
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?><styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><fonts count="2"><font><sz val="11" /><color theme="1" /><name val="Calibri" /><family val="2" /><scheme val="minor" /></font><font><sz val="12" /><color rgb="FF000000" /><name val="Calibri" /><family val="2" /><b /></font></fonts></styleSheet>'
        )

    def test_fonts_with_underline(self):
        # underline adds a <u /> element to the custom font
        self.worksheet.cell('A1').style.font.size = 12
        self.worksheet.cell('A1').style.font.bold = True
        self.worksheet.cell('A1').style.font.underline = Font.UNDERLINE_SINGLE
        w = StyleWriter(self.workbook)
        w._write_fonts()
        eq_(
            get_xml(w._root),
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?><styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><fonts count="2"><font><sz val="11" /><color theme="1" /><name val="Calibri" /><family val="2" /><scheme val="minor" /></font><font><sz val="12" /><color rgb="FF000000" /><name val="Calibri" /><family val="2" /><b /><u /></font></fonts></styleSheet>'
        )

    def test_fills(self):
        # a solid fill is appended after the two implicit default fills
        self.worksheet.cell('A1').style.fill.fill_type = 'solid'
        self.worksheet.cell(
            'A1').style.fill.start_color.index = Color.DARKYELLOW
        w = StyleWriter(self.workbook)
        w._write_fills()
        eq_(
            get_xml(w._root),
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?><styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><fills count="3"><fill><patternFill patternType="none" /></fill><fill><patternFill patternType="gray125" /></fill><fill><patternFill patternType="solid"><fgColor rgb="FF808000" /></patternFill></fill></fills></styleSheet>'
        )

    def test_borders(self):
        # a styled+coloured top border is appended after the empty default
        self.worksheet.cell(
            'A1').style.borders.top.border_style = Border.BORDER_THIN
        self.worksheet.cell(
            'A1').style.borders.top.color.index = Color.DARKYELLOW
        w = StyleWriter(self.workbook)
        w._write_borders()
        eq_(
            get_xml(w._root),
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?><styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><borders count="2"><border><left /><right /><top /><bottom /><diagonal /></border><border><left /><right /><top style="thin"><color rgb="FF808000" /></top><bottom /><diagonal /></border></borders></styleSheet>'
        )

    def test_write_cell_xfs_1(self):
        # only the font attribute differs, so only applyFont is set
        self.worksheet.cell('A1').style.font.size = 12
        w = StyleWriter(self.workbook)
        ft = w._write_fonts()
        nft = w._write_number_formats()
        w._write_cell_xfs(nft, ft, {}, {})
        xml = get_xml(w._root)
        ok_('applyFont="1"' in xml)
        ok_('applyFill="1"' not in xml)
        ok_('applyBorder="1"' not in xml)
        ok_('applyAlignment="1"' not in xml)

    def test_alignment(self):
        # horizontal/vertical centring produces an applied alignment
        self.worksheet.cell('A1').style.alignment.horizontal = 'center'
        self.worksheet.cell('A1').style.alignment.vertical = 'center'
        w = StyleWriter(self.workbook)
        nft = w._write_number_formats()
        w._write_cell_xfs(nft, {}, {}, {})
        xml = get_xml(w._root)
        ok_('applyAlignment="1"' in xml)
        ok_('horizontal="center"' in xml)
        ok_('vertical="center"' in xml)

    def test_alignment_rotation(self):
        # negative rotations are encoded in the 91..180 range: -34 -> 124
        self.worksheet.cell('A1').style.alignment.vertical = 'center'
        self.worksheet.cell('A1').style.alignment.text_rotation = 90
        self.worksheet.cell('A2').style.alignment.vertical = \
            'center'
        self.worksheet.cell('A2').style.alignment.text_rotation = 135
        self.worksheet.cell('A3').style.alignment.text_rotation = -34
        w = StyleWriter(self.workbook)
        nft = w._write_number_formats()
        w._write_cell_xfs(nft, {}, {}, {})
        xml = get_xml(w._root)
        ok_('textRotation="90"' in xml)
        ok_('textRotation="135"' in xml)
        ok_('textRotation="124"' in xml)

    def test_alignment_indent(self):
        self.worksheet.cell('A1').style.alignment.indent = 1
        self.worksheet.cell('A2').style.alignment.indent = 4
        self.worksheet.cell('A3').style.alignment.indent = 0
        self.worksheet.cell('A3').style.alignment.indent = -1
        w = StyleWriter(self.workbook)
        nft = w._write_number_formats()
        w._write_cell_xfs(nft, {}, {}, {})
        xml = get_xml(w._root)
        ok_('indent="1"' in xml)
        ok_('indent="4"' in xml)
        #Indents not greater than zero are ignored when writing
        ok_('indent="0"' not in xml)
        ok_('indent="-1"' not in xml)
def test_add_correct_sheet():
    """_add_sheet appends an already-created sheet at the end (index 2)."""
    wb = Workbook()
    sheet = wb.create_sheet()
    wb._add_sheet(sheet)
    assert wb.worksheets[2] == sheet
def test_create_sheet():
    """create_sheet() appends the new sheet at the end."""
    wb = Workbook()
    sheet = wb.create_sheet()
    assert wb.worksheets[-1] == sheet
def test_create_sheet_readonly():
    """A read-only workbook refuses to create sheets."""
    wb = Workbook()
    wb._read_only = True
    with pytest.raises(ReadOnlyWorkbookException):
        wb.create_sheet()
def test_get_index():
    """get_index reports position 0 for a sheet inserted at the front."""
    wb = Workbook()
    sheet = wb.create_sheet(0)
    eq_(wb.get_index(sheet), 0)
def test_create_sheet_with_name():
    """create_sheet(title=...) appends a sheet with the given name."""
    wb = Workbook()
    sheet = wb.create_sheet(title='LikeThisName')
    assert wb.worksheets[-1] == sheet
def test_get_named_range():
    """create_named_range registers a defined name resolving to Sheet1!A1."""
    wb = Workbook()
    sheet = wb.create_sheet()
    wb.create_named_range('test_nr', sheet, 'A1')
    assert wb.defined_names['test_nr'].value == 'Sheet1!A1'
def test_cannot_copy_writeonly(self):
    """Worksheets of a write-only workbook cannot be copied."""
    wb = Workbook(write_only=True)
    sheet = wb.create_sheet()
    with pytest.raises(ValueError):
        wb.copy_worksheet(sheet)
class _OpenpyxlWriter(ExcelWriter):
    # pandas ExcelWriter backend built on openpyxl (.xlsx/.xlsm output).
    engine = "openpyxl"
    supported_extensions = (".xlsx", ".xlsm")

    def __init__(self, path, engine=None, mode="w", **engine_kwargs):
        # Use the openpyxl module as the Excel writer.
        from openpyxl.workbook import Workbook

        super().__init__(path, mode=mode, **engine_kwargs)

        if self.mode == "a":
            # Load from existing workbook
            from openpyxl import load_workbook

            book = load_workbook(self.path)
            self.book = book
        else:
            # Create workbook object with default optimized_write=True.
            self.book = Workbook()

            # drop the default empty sheet so only written sheets remain
            if self.book.worksheets:
                try:
                    self.book.remove(self.book.worksheets[0])
                except AttributeError:
                    # compat - for openpyxl <= 2.4
                    self.book.remove_sheet(self.book.worksheets[0])

    def save(self):
        """
        Save workbook to disk.
        """
        return self.book.save(self.path)

    @classmethod
    def _convert_to_style(cls, style_dict):
        """
        Converts a style_dict to an openpyxl style object.

        Parameters
        ----------
        style_dict : style dictionary to convert
        """
        from openpyxl.style import Style

        xls_style = Style()
        for key, value in style_dict.items():
            for nk, nv in value.items():
                # borders need the nested border_style attribute set
                if key == "borders":
                    (xls_style.borders.__getattribute__(nk).__setattr__(
                        "border_style", nv))
                else:
                    xls_style.__getattribute__(key).__setattr__(nk, nv)
        return xls_style

    @classmethod
    def _convert_to_style_kwargs(cls, style_dict):
        """
        Convert a style_dict to a set of kwargs suitable for initializing
        or updating-on-copy an openpyxl v2 style object.

        Parameters
        ----------
        style_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'font'
                'fill'
                'border' ('borders')
                'alignment'
                'number_format'
                'protection'

        Returns
        -------
        style_kwargs : dict
            A dict with the same, normalized keys as ``style_dict`` but each
            value has been replaced with a native openpyxl style object of the
            appropriate class.
        """
        _style_key_map = {"borders": "border"}

        style_kwargs = {}
        for k, v in style_dict.items():
            if k in _style_key_map:
                k = _style_key_map[k]
            # dispatch to the matching _convert_to_<kind> helper, if any
            _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None)
            new_v = _conv_to_x(v)
            if new_v:
                style_kwargs[k] = new_v

        return style_kwargs

    @classmethod
    def _convert_to_color(cls, color_spec):
        """
        Convert ``color_spec`` to an openpyxl v2 Color object.

        Parameters
        ----------
        color_spec : str, dict
            A 32-bit ARGB hex string, or a dict with zero or more of the
            following keys.
                'rgb'
                'indexed'
                'auto'
                'theme'
                'tint'
                'index'
                'type'

        Returns
        -------
        color : openpyxl.styles.Color
        """
        from openpyxl.styles import Color

        if isinstance(color_spec, str):
            return Color(color_spec)
        else:
            return Color(**color_spec)

    @classmethod
    def _convert_to_font(cls, font_dict):
        """
        Convert ``font_dict`` to an openpyxl v2 Font object.

        Parameters
        ----------
        font_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'name'
                'size' ('sz')
                'bold' ('b')
                'italic' ('i')
                'underline' ('u')
                'strikethrough' ('strike')
                'color'
                'vertAlign' ('vertalign')
                'charset'
                'scheme'
                'family'
                'outline'
                'shadow'
                'condense'

        Returns
        -------
        font : openpyxl.styles.Font
        """
        from openpyxl.styles import Font

        _font_key_map = {
            "sz": "size",
            "b": "bold",
            "i": "italic",
            "u": "underline",
            "strike": "strikethrough",
            "vertalign": "vertAlign",
        }

        font_kwargs = {}
        for k, v in font_dict.items():
            if k in _font_key_map:
                k = _font_key_map[k]
            if k == "color":
                v = cls._convert_to_color(v)
            font_kwargs[k] = v

        return Font(**font_kwargs)

    @classmethod
    def _convert_to_stop(cls, stop_seq):
        """
        Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
        suitable for initializing the ``GradientFill`` ``stop`` parameter.

        Parameters
        ----------
        stop_seq : iterable
            An iterable that yields objects suitable for consumption by
            ``_convert_to_color``.

        Returns
        -------
        stop : list of openpyxl.styles.Color
        """
        return map(cls._convert_to_color, stop_seq)

    @classmethod
    def _convert_to_fill(cls, fill_dict):
        """
        Convert ``fill_dict`` to an openpyxl v2 Fill object.

        Parameters
        ----------
        fill_dict : dict
            A dict with one or more of the following keys (or their synonyms),
                'fill_type' ('patternType', 'patterntype')
                'start_color' ('fgColor', 'fgcolor')
                'end_color' ('bgColor', 'bgcolor')
            or one or more of the following keys (or their synonyms).
                'type' ('fill_type')
                'degree'
                'left'
                'right'
                'top'
                'bottom'
                'stop'

        Returns
        -------
        fill : openpyxl.styles.Fill
        """
        from openpyxl.styles import GradientFill, PatternFill

        _pattern_fill_key_map = {
            "patternType": "fill_type",
            "patterntype": "fill_type",
            "fgColor": "start_color",
            "fgcolor": "start_color",
            "bgColor": "end_color",
            "bgcolor": "end_color",
        }

        _gradient_fill_key_map = {"fill_type": "type"}

        # build kwargs for both fill kinds; which one applies is decided by
        # which constructor accepts the collected keys (pattern tried first)
        pfill_kwargs = {}
        gfill_kwargs = {}
        for k, v in fill_dict.items():
            pk = gk = None
            if k in _pattern_fill_key_map:
                pk = _pattern_fill_key_map[k]
            if k in _gradient_fill_key_map:
                gk = _gradient_fill_key_map[k]
            if pk in ["start_color", "end_color"]:
                v = cls._convert_to_color(v)
            if gk == "stop":
                v = cls._convert_to_stop(v)
            if pk:
                pfill_kwargs[pk] = v
            elif gk:
                gfill_kwargs[gk] = v
            else:
                pfill_kwargs[k] = v
                gfill_kwargs[k] = v

        try:
            return PatternFill(**pfill_kwargs)
        except TypeError:
            return GradientFill(**gfill_kwargs)

    @classmethod
    def _convert_to_side(cls, side_spec):
        """
        Convert ``side_spec`` to an openpyxl v2 Side object.

        Parameters
        ----------
        side_spec : str, dict
            A string specifying the border style, or a dict with zero or more
            of the following keys (or their synonyms).
                'style' ('border_style')
                'color'

        Returns
        -------
        side : openpyxl.styles.Side
        """
        from openpyxl.styles import Side

        _side_key_map = {"border_style": "style"}

        if isinstance(side_spec, str):
            return Side(style=side_spec)

        side_kwargs = {}
        for k, v in side_spec.items():
            if k in _side_key_map:
                k = _side_key_map[k]
            if k == "color":
                v = cls._convert_to_color(v)
            side_kwargs[k] = v

        return Side(**side_kwargs)

    @classmethod
    def _convert_to_border(cls, border_dict):
        """
        Convert ``border_dict`` to an openpyxl v2 Border object.

        Parameters
        ----------
        border_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'left'
                'right'
                'top'
                'bottom'
                'diagonal'
                'diagonal_direction'
                'vertical'
                'horizontal'
                'diagonalUp' ('diagonalup')
                'diagonalDown' ('diagonaldown')
                'outline'

        Returns
        -------
        border : openpyxl.styles.Border
        """
        from openpyxl.styles import Border

        _border_key_map = {
            "diagonalup": "diagonalUp",
            "diagonaldown": "diagonalDown"
        }

        border_kwargs = {}
        for k, v in border_dict.items():
            if k in _border_key_map:
                k = _border_key_map[k]
            if k == "color":
                v = cls._convert_to_color(v)
            if k in ["left", "right", "top", "bottom", "diagonal"]:
                v = cls._convert_to_side(v)
            border_kwargs[k] = v

        return Border(**border_kwargs)

    @classmethod
    def _convert_to_alignment(cls, alignment_dict):
        """
        Convert ``alignment_dict`` to an openpyxl v2 Alignment object.

        Parameters
        ----------
        alignment_dict : dict
            A dict with zero or more of the following keys (or their synonyms).
                'horizontal'
                'vertical'
                'text_rotation'
                'wrap_text'
                'shrink_to_fit'
                'indent'

        Returns
        -------
        alignment : openpyxl.styles.Alignment
        """
        from openpyxl.styles import Alignment

        return Alignment(**alignment_dict)

    @classmethod
    def _convert_to_number_format(cls, number_format_dict):
        """
        Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
        initializer.

        Parameters
        ----------
        number_format_dict : dict
            A dict with zero or more of the following keys.
                'format_code' : str

        Returns
        -------
        number_format : str
        """
        return number_format_dict["format_code"]

    @classmethod
    def _convert_to_protection(cls, protection_dict):
        """
        Convert ``protection_dict`` to an openpyxl v2 Protection object.

        Parameters
        ----------
        protection_dict : dict
            A dict with zero or more of the following keys.
                'locked'
                'hidden'

        Returns
        -------
        protection : openpyxl.styles.Protection
        """
        from openpyxl.styles import Protection

        return Protection(**protection_dict)

    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
                    freeze_panes=None):
        # Write the frame cells using openpyxl.
        sheet_name = self._get_sheet_name(sheet_name)

        # converted styles cached per str(style) to avoid re-conversion
        _style_cache = {}

        if sheet_name in self.sheets:
            wks = self.sheets[sheet_name]
        else:
            wks = self.book.create_sheet()
            wks.title = sheet_name
            self.sheets[sheet_name] = wks

        if _validate_freeze_panes(freeze_panes):
            # openpyxl cells are 1-based; pandas coordinates are 0-based
            wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1,
                                        column=freeze_panes[1] + 1)

        for cell in cells:
            xcell = wks.cell(row=startrow + cell.row + 1,
                             column=startcol + cell.col + 1)
            xcell.value, fmt = self._value_with_fmt(cell.val)
            if fmt:
                xcell.number_format = fmt

            style_kwargs = {}
            if cell.style:
                key = str(cell.style)
                style_kwargs = _style_cache.get(key)
                if style_kwargs is None:
                    style_kwargs = self._convert_to_style_kwargs(cell.style)
                    _style_cache[key] = style_kwargs

            if style_kwargs:
                for k, v in style_kwargs.items():
                    setattr(xcell, k, v)

            if cell.mergestart is not None and cell.mergeend is not None:

                wks.merge_cells(
                    start_row=startrow + cell.row + 1,
                    start_column=startcol + cell.col + 1,
                    end_column=startcol + cell.mergeend + 1,
                    end_row=startrow + cell.mergestart + 1,
                )

                # When cells are merged only the top-left cell is preserved
                # The behaviour of the other cells in a merged range is
                # undefined
                if style_kwargs:
                    first_row = startrow + cell.row + 1
                    last_row = startrow + cell.mergestart + 1
                    first_col = startcol + cell.col + 1
                    last_col = startcol + cell.mergeend + 1

                    for row in range(first_row, last_row + 1):
                        for col in range(first_col, last_col + 1):
                            if row == first_row and col == first_col:
                                # Ignore first cell. It is already handled.
                                continue
                            xcell = wks.cell(column=col, row=row)
                            for k, v in style_kwargs.items():
                                setattr(xcell, k, v)
def test_get_sheet_names():
    """sheetnames lists the expected auto-generated names."""
    wb = Workbook()
    expected = ['Sheet'] + ['Sheet%d' % n for n in range(1, 6)]
    for _ in range(5):
        wb.create_sheet(0)
    assert wb.sheetnames == expected
# Sample `num` random rows from `sheet` (defined earlier in the script),
# flatten their cell values into data_list, keep every other value in `a`,
# then write both lists into a new workbook.
for rownum in random.sample(range(1, 100), num):
    for columnnum in range(1, sheet.max_column + 1):
        data_list.append(sheet.cell(row=rownum, column=columnnum).value)
for i in data_list:
    print(i)
# keep the values at even positions (every other cell)
a = []
for i in range(len(data_list)):
    if i % 2 == 0:
        a.append(data_list[i])
for i in a:
    print(i)
wbl = Workbook()
new_ws = wbl.create_sheet(title='test')
key_ws = wbl.create_sheet(title='test_key')
# lay out data_list two columns wide on the key sheet
i = 1
k = 1
for data_1 in range(len(data_list)):
    # BUG FIX: the original indexed with the misspelled name `data_l`
    # (letter l), which raised NameError; the loop variable is `data_1`.
    key_ws.cell(column=k, row=i, value=data_list[data_1])
    if k % 2 == 1:
        k = 2
    else:
        k = 1
        i += 1
# and the filtered values one per row on the test sheet
i = 1
for data_m in range(len(a)):
    new_ws.cell(column=1, row=i, value=a[data_m])
    i += 1
class CheKeShengXian: '''车客生鲜爬虫''' def __init__(self): self.proxies = {} self.ipf = IpFilter('1') self.dict = [] self.dict = UserAgent.MY_USER_AGENT self.outwb = Workbook() self.BaseUrl = "https://api.jingchengcaidian.com/api/Category/getCategoryName?" self.headers = { 'charset': 'utf-8', 'Accept-Encoding': 'gzip', 'referer': 'https://servicewechat.com/wx2911548a18ed7d95/42/page-frame.html', 'content-type': 'application/json', 'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; FRD-AL10 Build/HUAWEIFRD-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/68.0.3440.91 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070337) NetType/WIFI Language/zh_CN Process/appbrand2', 'Host': 'api.jingchengcaidian.com', 'Connection': 'Keep-Alive' } self.PidIndex = { '7': '蔬菜', '5': '水果', '4': '水产', '8': '禽蛋', '1': '肉类', '9': '粮油', '2': '乳饮', '10': '零食', '3': '速食', '6': '生活', '23': '扶贫' } self.pageCount = [1, 2, 3, 4] def initialization(self): self.headers['User-Agent'] = random.sample(self.dict, 1)[0] #self.proxies = self.ipf.getIp() self.proxies = {'HTTP': 'localhost:8080'} #s设置请求头和ip def getUrlAndSheet(self): #url工厂 returnData = {} for pid, value in self.PidIndex.items(): urlList = [] for page in self.pageCount: url = self.BaseUrl + 'pid=' + pid + '&page=' + str(page) urlList.append(url) returnData[pid] = urlList return returnData def getExcel(self, careerName, outwb): wo = self.outwb.active careerSheet = self.outwb.create_sheet(unicode(careerName), 0) careerSheet.append([ 'id', '库存amount', '下单数buynum', '团购价格group_price', '划线价market_price', '名称name', '特色标签label', '描述samllpromotion' ]) return careerSheet def SaveExcel(self, ExcelName): self.outwb.save(ExcelName + '.xlsx') def getData(self): urlDict = self.getUrlAndSheet() for key, values in urlDict.items(): careerSheet = self.getExcel(self.PidIndex[key], self.outwb) for url in values: bool = True while bool: #如果代理ip和请求头未能获得数据,则更换信息,继续请求 try: self.initialization() req = requests.get(url=url, 
headers=self.headers).json() data1 = json.dumps(req, ensure_ascii=False) data2 = json.loads(data1) if data2['message'] == '列表获取成功': for item in data2['data']: careerSheet.append([ item['id'], item['amount'], item['buynum'], item['group_price'], item['market_price'], item['name'], item['lable'], item['smallpromotion'] ]) bool = False print data2['message'] except: print '爬虫失败,继续尝试' def Crawl(self, ExcelName): self.getData() self.SaveExcel(ExcelName)
def to_xls_export(self, path, data, *args):
    """Write ``data`` to an XLSX workbook at ``path``.

    One optimized-write worksheet is created per survey section; each
    record in ``data`` is flattened with ``dict_to_joined_export`` and its
    per-section rows are appended to the matching sheet.
    """
    def write_row(data, work_sheet, fields, work_sheet_titles):
        # update parent_table with the generated sheet's title
        data[PARENT_TABLE_NAME] = work_sheet_titles.get(
            data.get(PARENT_TABLE_NAME))
        work_sheet.append([data.get(f) for f in fields])

    wb = Workbook(optimized_write=True)
    work_sheets = {}
    # map of section_names to generated_names
    work_sheet_titles = {}
    for section in self.sections:
        section_name = section['name']
        work_sheet_title = ExportBuilder.get_valid_sheet_name(
            "_".join(section_name.split("/")), work_sheet_titles.values())
        work_sheet_titles[section_name] = work_sheet_title
        work_sheets[section_name] = wb.create_sheet(title=work_sheet_title)

    # write the headers
    for section in self.sections:
        section_name = section['name']
        headers = [element['title'] for element in
                   section['elements']] + self.EXTRA_FIELDS
        # get the worksheet
        ws = work_sheets[section_name]
        ws.append(headers)

    index = 1
    indices = {}
    survey_name = self.survey.name
    for d in data:
        joined_export = dict_to_joined_export(d, index, indices, survey_name)
        output = ExportBuilder.decode_mongo_encoded_section_names(
            joined_export)
        # attach meta fields (index, parent_index, parent_table)
        # output has keys for every section
        if survey_name not in output:
            output[survey_name] = {}
        output[survey_name][INDEX] = index
        output[survey_name][PARENT_INDEX] = -1
        for section in self.sections:
            # get data for this section and write to xls
            section_name = section['name']
            fields = [element['xpath'] for element in
                      section['elements']] + self.EXTRA_FIELDS
            ws = work_sheets[section_name]
            # section might not exist within the output, e.g. data was
            # not provided for said repeat - write test to check this
            row = output.get(section_name, None)
            # isinstance instead of ``type(row) ==`` comparisons, matching
            # the newer version of this exporter in the file.
            if isinstance(row, dict):
                write_row(self.pre_process_row(row, section), ws, fields,
                          work_sheet_titles)
            elif isinstance(row, list):
                for child_row in row:
                    write_row(self.pre_process_row(child_row, section), ws,
                              fields, work_sheet_titles)
        index += 1

    wb.save(filename=path)
def to_xls_export(self, path, data, *args, **kwargs):
    """Export ``data`` to a write-only XLSX workbook at ``path``.

    One worksheet per survey section.  Recognised keyword arguments:
    ``dataview`` (restricts exported columns via ``self.get_fields``),
    ``total_records`` (progress reporting) and ``columns_with_hxl``
    (HXL tag row).  Header / label / HXL rows are emitted according to
    the INCLUDE_* flags on the builder.
    """
    def write_row(data, work_sheet, fields, work_sheet_titles):
        # update parent_table with the generated sheet's title
        data[PARENT_TABLE_NAME] = work_sheet_titles.get(
            data.get(PARENT_TABLE_NAME))
        work_sheet.append([data.get(f) for f in fields])

    dataview = kwargs.get('dataview')
    total_records = kwargs.get('total_records')

    wb = Workbook(write_only=True)
    work_sheets = {}
    # map of section_names to generated_names
    work_sheet_titles = {}
    for section in self.sections:
        section_name = section['name']
        work_sheet_title = ExportBuilder.get_valid_sheet_name(
            '_'.join(section_name.split('/')), work_sheet_titles.values())
        work_sheet_titles[section_name] = work_sheet_title
        work_sheets[section_name] = wb.create_sheet(
            title=work_sheet_title)

    # write the headers (skipped when only labels are wanted)
    if not self.INCLUDE_LABELS_ONLY:
        for section in self.sections:
            section_name = section['name']
            headers = self.get_fields(dataview, section, 'title')

            # get the worksheet
            ws = work_sheets[section_name]
            ws.append(headers)

    # write labels
    if self.INCLUDE_LABELS or self.INCLUDE_LABELS_ONLY:
        for section in self.sections:
            section_name = section['name']
            labels = self.get_fields(dataview, section, 'label')

            # get the worksheet
            ws = work_sheets[section_name]
            ws.append(labels)

    media_xpaths = [] if not self.INCLUDE_IMAGES \
        else self.dd.get_media_survey_xpaths()

    # write hxl header
    columns_with_hxl = kwargs.get('columns_with_hxl')
    if self.INCLUDE_HXL and columns_with_hxl:
        for section in self.sections:
            section_name = section['name']
            headers = self.get_fields(dataview, section, 'title')

            # get the worksheet
            ws = work_sheets[section_name]

            # lookup an HXL tag for each header; append only when non-empty
            hxl_row = [columns_with_hxl.get(col, '')
                       for col in headers]
            hxl_row and ws.append(hxl_row)

    index = 1
    indices = {}
    survey_name = self.survey.name
    for i, d in enumerate(data, start=1):
        joined_export = dict_to_joined_export(d, index, indices,
                                              survey_name, self.survey,
                                              d, media_xpaths)
        output = decode_mongo_encoded_section_names(joined_export)
        # attach meta fields (index, parent_index, parent_table)
        # output has keys for every section
        if survey_name not in output:
            output[survey_name] = {}
        output[survey_name][INDEX] = index
        output[survey_name][PARENT_INDEX] = -1
        for section in self.sections:
            # get data for this section and write to xls
            section_name = section['name']
            fields = self.get_fields(dataview, section, 'xpath')

            ws = work_sheets[section_name]
            # section might not exist within the output, e.g. data was
            # not provided for said repeat - write test to check this
            row = output.get(section_name, None)
            if isinstance(row, dict):
                write_row(
                    self.pre_process_row(row, section),
                    ws, fields, work_sheet_titles)
            elif isinstance(row, list):
                for child_row in row:
                    write_row(
                        self.pre_process_row(child_row, section),
                        ws, fields, work_sheet_titles)
        index += 1
        track_task_progress(i, total_records)

    wb.save(filename=path)
def to_xls_export(self, path, data, username, id_string, *args):
    """Export ``data`` for the form ``username``/``id_string`` to ``path``.

    Like the base exporter, but media answers are written as Excel
    HYPERLINK formulas pointing at the attachment endpoint and the virtual
    ``fs_status`` column is resolved from ``FInstance``.
    """
    xform = XForm.objects.get(user__username__iexact=username,
                              id_string__exact=id_string)
    json_question = json.loads(xform.json)
    parsedQuestions = get_questions_and_media_attributes(
        json_question['children'])

    from django.contrib.sites.models import Site as DjangoSite
    domain = DjangoSite.objects.get_current().domain

    def write_row(data, work_sheet, fields, work_sheet_titles):
        # work_sheet_titles = work_sheet_titles.append("fs_site")
        # update parent_table with the generated sheet's title
        data[PARENT_TABLE_NAME] = work_sheet_titles.get(
            data.get(PARENT_TABLE_NAME))
        data_new = []
        for f in fields:
            if f in data and f in parsedQuestions.get('media_attributes'):
                # Media answer -> clickable attachment hyperlink.
                data_new.append('=HYPERLINK("http://' + domain +
                                '/attachment/medium?media_file=' +
                                xform.user.username + '/attachments/' +
                                data.get(f) + '", "Attachment")')
            else:
                if f == "fs_status":
                    # Best-effort status lookup; narrowed from a bare
                    # ``except:`` so system exits aren't swallowed.
                    try:
                        status = FInstance.objects.get(
                            instance_id=data.get(
                                '_id')).get_form_status_display()
                    except Exception:
                        status = "No Status"
                    data_new.append(status)
                else:
                    data_new.append(data.get(f, ''))
        work_sheet.append(data_new)

    wb = Workbook(optimized_write=True)
    work_sheets = {}
    # map of section_names to generated_names
    work_sheet_titles = {}
    for section in self.sections:
        section_name = section['name']
        work_sheet_title = ExportBuilder.get_valid_sheet_name(
            "_".join(section_name.split("/")), work_sheet_titles.values())
        work_sheet_titles[section_name] = work_sheet_title
        work_sheets[section_name] = wb.create_sheet(title=work_sheet_title)

    # write the headers
    for section in self.sections:
        section_name = section['name']
        headers = [element['title'] for element in
                   section['elements']] + self.EXTRA_FIELDS
        # get the worksheet
        ws = work_sheets[section_name]
        ws.append(headers)

    index = 1
    indices = {}
    survey_name = self.survey.name
    for d in data:
        joined_export = dict_to_joined_export(d, index, indices, survey_name)
        output = ExportBuilder.decode_mongo_encoded_section_names(
            joined_export)
        # attach meta fields (index, parent_index, parent_table)
        # output has keys for every section
        if survey_name not in output:
            output[survey_name] = {}
        output[survey_name][INDEX] = index
        output[survey_name][PARENT_INDEX] = -1
        for section in self.sections:
            # get data for this section and write to xls
            section_name = section['name']
            fields = [element['xpath'] for element in
                      section['elements']] + self.EXTRA_FIELDS
            ws = work_sheets[section_name]
            # section might not exist within the output, e.g. data was
            # not provided for said repeat - write test to check this
            row = output.get(section_name, None)
            # isinstance instead of ``type(row) ==`` comparisons.
            if isinstance(row, dict):
                write_row(self.pre_process_row(row, section), ws, fields,
                          work_sheet_titles)
            elif isinstance(row, list):
                for child_row in row:
                    write_row(self.pre_process_row(child_row, section), ws,
                              fields, work_sheet_titles)
        index += 1

    wb.save(filename=path)
class SHSX:
    # 食行生鲜 crawler: scrapes product data from the 34580.com API into an
    # Excel workbook, one sheet per top-level category.  Python 2 code.
    def __init__(self):
        # Pool of mobile User-Agent strings rotated per request.
        self.dict = []
        self.dict = UserAgent.Android_USER_AGENT + UserAgent.iPhone_USER_AGENT
        #self.ipf = IpFilter('1')
        self.outwb = Workbook()  # output Excel workbook
        # Top-level category name -> category detail URL.
        self.url_Class = {
            '蔬菜豆菇': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100840',
            '新鲜水果': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100559',
            '鲜肉蛋禽': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100771',
            '水产生鲜': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100591',
            '乳品烘培': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100705',
            '面点速食': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100736',
            '粮油副食': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100870',
            '休闲零食': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100935',
            '酒水饮料': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100996',
            '生活百货': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=100625',
            '鲜花绿植': 'https://api1.34580.com/sz/productcategory/firstdetail?sourcetype=9&id=101017'
        }
        self.headers = {
            'charset': 'utf-8',
            'Accept-Encoding': 'gzip',
            'referer': 'https://servicewechat.com/wx6e7ce0c196b0c3c2/32/page-frame.html',
            'content-type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; FRD-AL10 Build/HUAWEIFRD-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/68.0.3440.91 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070338) NetType/WIFI Language/zh_CN Process/toolsmp',
            'Host': 'api1.34580.com',
            'Connection': 'Keep-Alive'
        }

    def getBaseUrl(self, str_ID):
        # Build a listing URL that fetches up to 1000 products per page
        # for the given sub-category id.
        url_allClass = 'https://api1.34580.com/sz/ProductRequests/ProductMultiConditionRequest?sourcetype=9&OrderDirectionType=0&OrderFieldType=0&CategoryIds=%s&PageSize=%s&PageIndex=1&SourceType=9&MallTypes=&joinedproduct=true' % (
            str_ID, '1000')
        return url_allClass

    def initialization(self):
        # Request setup: pick a random User-Agent and (re)set the proxy.
        self.headers['User-Agent'] = random.sample(self.dict, 1)[0]
        #print self.headers['User-Agent']
        #self.proxies = self.ipf.getIp()
        self.proxies = {'HTTP': 'localhost:8080'}
        #print self.proxies
        print '已初始化useragent和代理ip'

    def getExcel(self, career_Name):
        # Create a header-initialized sheet for one category.
        wo = self.outwb.active
        careerSheet = self.outwb.create_sheet(unicode(career_Name), 0)
        careerSheet.append(
            ['展示名称', '团购价', '标签', '划线价', '单价', '重量', '单位', '展示名称', '已售数量'])
        return careerSheet

    def SaveExcel(self, ExcelName):
        self.outwb.save(ExcelName + '.xlsx')

    def getDataByUrl(self, url):
        # Fetch one URL and return the JSON response as a string.
        print '当前url' + url
        self.initialization()
        req = requests.get(url=url, headers=self.headers,
                           proxies=self.proxies).json()
        return json.dumps(req, ensure_ascii=False)

    def getAllClassID(self):
        # Replace each category URL with a {sub-category name: listing url}
        # dict resolved from the category detail endpoint.
        for Key, Value in self.url_Class.items():
            urlDict = {}
            urldata_ID = json.loads(self.getDataByUrl(Value))
            for item in urldata_ID['Data']['Children']:
                urlDict[item['Name']] = self.getBaseUrl(item['Id'])
            self.url_Class[Key] = urlDict
        print '所有子类url组装到url_Class——完毕'
        #print self.url_Class

    # Fetch per-product detail data (e.g. units sold) for a product id.
    def getSingleProductData(self, id):
        data_ = ''
        try:
            url = "https://api1.34580.com/sz/product/detail?sourcetype=9&ssuId=%s" % id
            headers = {
                'charset': 'utf-8',
                'Accept-Encoding': 'gzip',
                'referer': 'https://servicewechat.com/wx6e7ce0c196b0c3c2/32/page-frame.html',
                'content-type': 'application/json',
                'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; FRD-AL10 Build/HUAWEIFRD-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/68.0.3440.91 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070338) NetType/WIFI Language/zh_CN Process/toolsmp',
                'Host': 'api1.34580.com',
                'Connection': 'Keep-Alive'
            }
            req = requests.get(url=url, headers=headers).json()
            data = json.loads(json.dumps(req, ensure_ascii=False))
            data_ = data['result']['productInfo']['soldNumber']
        except:
            # best-effort: fall through with an empty sold-count
            print '销售量失败'
        print data_
        return data_

    def getAllClassData(self):
        self.getAllClassID()
        for Key, Value in self.url_Class.items():
            # One Excel sheet per top-level category.
            careerSheet = self.getExcel(Key)
            for key, value in Value.items():
                careerSheet.append([key])
                try:
                    data = json.loads(self.getDataByUrl(value))
                    if data['Message'] == '返回正确':
                        for item in data['Data']['SourceData']:
                            careerSheet.append([
                                item['ProductName'],      # display name
                                item['UnitPeriodMoney'],  # group-buy price
                                item['PriceName'],        # tag
                                item['DefaultMoney'],     # strike-through price
                                item['PvStandard'],       # unit price
                                item['Weight'],           # weight
                                item['Unit'],             # unit
                                item['ProductName'],      # display name
                                str(
                                    self.getSingleProductData(
                                        item['ProductId']))  # units sold
                            ])
                    print data['Message']
                except:
                    print 'Fail'

    def Crawl(self, ExcelName):
        self.getAllClassData()  # assemble urls and collect data
        self.SaveExcel(ExcelName)  # persist workbook
class _OpenpyxlWriter(ExcelWriter):
    # ExcelWriter backend using the openpyxl 1.x API (cell-by-address,
    # ``Style`` objects from ``openpyxl.style``).
    engine = 'openpyxl'
    supported_extensions = ('.xlsx', '.xlsm')

    def __init__(self, path, engine=None, **engine_kwargs):
        # Only openpyxl versions inside the compat window support the
        # 1.x-style API used below.
        if not openpyxl_compat.is_compat():
            raise ValueError('Installed openpyxl is not supported at this '
                             'time. Use >={0} and '
                             '<{1}.'.format(openpyxl_compat.start_ver,
                                            openpyxl_compat.stop_ver))
        # Use the openpyxl module as the Excel writer.
        from openpyxl.workbook import Workbook

        super(_OpenpyxlWriter, self).__init__(path, **engine_kwargs)

        # Create workbook object with default optimized_write=True.
        self.book = Workbook()
        # Openpyxl 1.6.1 adds a dummy sheet. We remove it.
        if self.book.worksheets:
            self.book.remove_sheet(self.book.worksheets[0])

    def save(self):
        """
        Save workbook to disk.
        """
        return self.book.save(self.path)

    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
        # Write the frame cells using openpyxl.
        from openpyxl.cell import get_column_letter

        sheet_name = self._get_sheet_name(sheet_name)

        if sheet_name in self.sheets:
            wks = self.sheets[sheet_name]
        else:
            wks = self.book.create_sheet()
            wks.title = sheet_name
            self.sheets[sheet_name] = wks

        for cell in cells:
            # openpyxl 1.x addresses cells by "A1"-style strings.
            colletter = get_column_letter(startcol + cell.col + 1)
            xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1))
            xcell.value = _conv_value(cell.val)
            style = None
            if cell.style:
                style = self._convert_to_style(cell.style)
                for field in style.__fields__:
                    xcell.style.__setattr__(field,
                                            style.__getattribute__(field))

            if isinstance(cell.val, datetime.datetime):
                xcell.style.number_format.format_code = self.datetime_format
            elif isinstance(cell.val, datetime.date):
                xcell.style.number_format.format_code = self.date_format

            if cell.mergestart is not None and cell.mergeend is not None:
                cletterstart = get_column_letter(startcol + cell.col + 1)
                cletterend = get_column_letter(startcol + cell.mergeend + 1)

                wks.merge_cells('%s%s:%s%s' %
                                (cletterstart,
                                 startrow + cell.row + 1,
                                 cletterend,
                                 startrow + cell.mergestart + 1))

                # Excel requires that the format of the first cell in a
                # merged # range is repeated in the rest of the merged
                # range.
                if style:
                    first_row = startrow + cell.row + 1
                    last_row = startrow + cell.mergestart + 1
                    first_col = startcol + cell.col + 1
                    last_col = startcol + cell.mergeend + 1

                    for row in range(first_row, last_row + 1):
                        for col in range(first_col, last_col + 1):
                            if row == first_row and col == first_col:
                                # Ignore first cell. It is already handled.
                                continue
                            colletter = get_column_letter(col)
                            xcell = wks.cell("%s%s" % (colletter, row))
                            for field in style.__fields__:
                                xcell.style.__setattr__(
                                    field, style.__getattribute__(field))

    @classmethod
    def _convert_to_style(cls, style_dict):
        """
        converts a style_dict to an openpyxl style object

        Parameters
        ----------
        style_dict: style dictionary to convert
        """
        from openpyxl.style import Style
        xls_style = Style()
        for key, value in style_dict.items():
            for nk, nv in value.items():
                if key == "borders":
                    # each border side only supports a 'border_style' here
                    (xls_style.borders.__getattribute__(nk).__setattr__(
                        'border_style', nv))
                else:
                    xls_style.__getattribute__(key).__setattr__(nk, nv)

        return xls_style
["help", "file=", "geolitecity=", "excelfile=", "coloring"]) except getopt.GetoptError, err: print str(err) sys.exit(2) for o, a in opts: if o in ("-h", "--help"): usage() sys.exit(0) elif o in ("-e", "--excelfile"): if not has_openpyxl: print "You must install OpenPyxl 2.1.2+ for xlsx format:\n\thttps://pypi.python.org/pypi/openpyxl" sys.exit(-1) excelfile = a wb = Workbook(optimized_write=True) ws = wb.create_sheet() ws.title = "IP Address Info" ws.append(header) elif o in ("-g", "--geolitecity"): GeoLite = a elif o in ("-f", "--file"): if os.path.isfile(a): ipfile = open(a, "r") else: print a + " is not a file" usage() sys.exit(-1) elif o in ("-c", "--coloring"): color = True else: assert False, "unhandled option\n\n"
class ExcelWriter(object):
    """
    Class for writing DataFrame objects into excel sheets, uses xlwt for xls,
    openpyxl for xlsx.  See DataFrame.to_excel for typical usage.

    Parameters
    ----------
    path : string
        Path to xls file
    """
    def __init__(self, path):
        # Engine selection is by file extension: .xls -> xlwt,
        # anything else -> openpyxl (optimized/streaming write mode).
        self.use_xlsx = True
        if path.endswith('.xls'):
            self.use_xlsx = False
            import xlwt
            self.book = xlwt.Workbook()
            self.fm_datetime = xlwt.easyxf(
                num_format_str='YYYY-MM-DD HH:MM:SS')
            self.fm_date = xlwt.easyxf(num_format_str='YYYY-MM-DD')
        else:
            from openpyxl.workbook import Workbook
            self.book = Workbook(optimized_write = True)
        self.path = path
        # sheet_name -> (sheet, next row index)
        self.sheets = {}
        self.cur_sheet = None

    def save(self):
        """
        Save workbook to disk
        """
        self.book.save(self.path)

    def writerow(self, row, sheet_name=None):
        """
        Write the given row into Excel an excel sheet

        Parameters
        ----------
        row : list
            Row of data to save to Excel sheet
        sheet_name : string, default None
            Name of Excel sheet, if None, then use self.cur_sheet
        """
        if sheet_name is None:
            sheet_name = self.cur_sheet
        if sheet_name is None:  # pragma: no cover
            raise Exception('Must pass explicit sheet_name or set '
                            'cur_sheet property')

        if self.use_xlsx:
            self._writerow_xlsx(row, sheet_name)
        else:
            self._writerow_xls(row, sheet_name)

    def _writerow_xls(self, row, sheet_name):
        # xlwt backend: write cell-by-cell with explicit date formats and
        # numpy scalar conversion.
        if sheet_name in self.sheets:
            sheet, row_idx = self.sheets[sheet_name]
        else:
            sheet = self.book.add_sheet(sheet_name)
            row_idx = 0
        sheetrow = sheet.row(row_idx)
        for i, val in enumerate(row):
            if isinstance(val, (datetime.datetime, datetime.date)):
                if isinstance(val, datetime.datetime):
                    sheetrow.write(i,val, self.fm_datetime)
                else:
                    sheetrow.write(i,val, self.fm_date)
            elif isinstance(val, np.int64):
                sheetrow.write(i,int(val))
            elif isinstance(val, np.bool8):
                sheetrow.write(i,bool(val))
            else:
                sheetrow.write(i,val)
        row_idx += 1
        if row_idx == 1000:
            # flush xlwt's in-memory row buffer once it fills up
            sheet.flush_row_data()
        self.sheets[sheet_name] = (sheet, row_idx)

    def _writerow_xlsx(self, row, sheet_name):
        # openpyxl backend: append the whole row at once, converting numpy
        # scalars to native Python types first.
        if sheet_name in self.sheets:
            sheet, row_idx = self.sheets[sheet_name]
        else:
            sheet = self.book.create_sheet()
            sheet.title = sheet_name
            row_idx = 0

        conv_row = []
        for val in row:
            if isinstance(val, np.int64):
                val = int(val)
            elif isinstance(val, np.bool8):
                val = bool(val)
            conv_row.append(val)
        sheet.append(conv_row)
        row_idx += 1
        self.sheets[sheet_name] = (sheet, row_idx)
class boss_crawler:
    """BOSS 直聘 (zhipin.com) job-listing crawler.

    Crawls the listing pages for each keyword in ``self.keyword``, parses
    company/size/title/experience/education/salary/date fields and writes
    the rows to an xlsx workbook (one sheet per keyword).
    """

    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connecion': 'keep-alive',
            #'Cookie':'__a=32544699.1550979725.1550979725.1550979732.57.2.56.57; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1550979723,1552831132,1552906964; lastCity=101010100; _uab_collina=155283113440544880091396; __g=-; __l=r=&l=%2Fsao.zhipin.com%2F; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1552907207; __c=1550979732; JSESSIONID=""',
            'Host': 'www.zhipin.com',
            'Referer': 'https://www.zhipin.com/c101010100/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }  # NOTE: the cookie expires and must be refreshed periodically
        self.keyword = ['java']
        self.outwb = Workbook()

    def getHtml(self, url):
        # Throttle requests a little to avoid being blocked.
        time.sleep(2)
        response = request('get', url, headers=self.headers)
        #html = json.loads(response.text)
        html = response.text
        return html

    def getExcel(self, careerName):
        """Create and return a header-initialized sheet for one keyword.

        (An unused ``wo = self.outwb.active`` local was removed.)
        """
        careerSheet = self.outwb.create_sheet(careerName)
        careerSheet.append(
            ['公司', '规模低', '规模顶', '职位', '最低工作经验', '学历', '底薪', '顶薪', '发布时间'])
        return careerSheet

    def saveExcel(self):
        self.outwb.save(
            "E:\DataAnalysis\\tools\python3\project\money_analysis\JodFinding\work.xlsx"
        )

    def getNum(self, text):
        """Return all digit runs in *text* as a list of strings.

        (Parameter renamed from ``str`` — it shadowed the builtin.)
        """
        pattern = re.compile(r'\d+')
        return pattern.findall(text)

    def getSchool(self, text):
        """Extract the education level from an experience/education string.

        Splits on 年 / 限 / 生 in turn and returns the text after the first
        separator that matches; prints 'error' and returns None when none
        of the splits yields a second part.
        """
        by_year = re.compile(r'[年]+').split(text)
        by_unlimited = re.compile(r'[限]+').split(text)
        by_graduate = re.compile(r'[生]+').split(text)
        if len(by_year) > 1:
            print(by_year)
            return by_year[1]
        elif len(by_unlimited) > 1:
            print(by_unlimited)
            return by_unlimited[1]
        else:
            try:
                print(by_graduate)
                return by_graduate[1]
            # Narrowed from a bare ``except:``: only the missing-index case
            # is expected here.
            except IndexError:
                print('error')

    def get_SaveMessage(self, careerSheet, soup):
        """Parse every job card in *soup* and append one row per job."""
        joblist = soup.find_all('div', {'class': 'job-primary'})
        for item in joblist:
            company = item.find('div', {
                'class': 'info-company'
            }).find('a').get_text()
            big = item.find('div', {
                'class': 'info-company'
            }).find('p').get_text()
            job = item.find('div', {
                'class': 'info-primary'
            }).find('div', {
                'class': 'job-title'
            }).get_text()
            graduate = item.find('div',
                                 {'class': 'info-primary'}).p.get_text()
            money = item.find('div', {
                'class': 'info-primary'
            }).find('span').get_text()
            # Renamed from ``time``: it shadowed the imported time module.
            publish_time = item.find('div', {
                'class': 'info-publis'
            }).find('p').get_text()
            # NOTE(review): ``self.getNum(money)[1]`` assumes the salary
            # string always contains two numbers (e.g. "10k-20k"); a
            # single-number salary would raise IndexError — TODO confirm.
            if len(self.getNum(big)) > 1:
                if len(self.getNum(graduate)) > 0:
                    careerSheet.append([
                        company,
                        self.getNum(big)[0],
                        self.getNum(big)[1], job,
                        self.getNum(graduate)[0],
                        self.getSchool(graduate),
                        self.getNum(money)[0],
                        self.getNum(money)[1], publish_time
                    ])
                else:
                    careerSheet.append([
                        company,
                        self.getNum(big)[0],
                        self.getNum(big)[1], job, '无',
                        self.getSchool(graduate),
                        self.getNum(money)[0],
                        self.getNum(money)[1], publish_time
                    ])
            else:
                if len(self.getNum(graduate)) > 0:
                    careerSheet.append([
                        company,
                        self.getNum(big)[0], '以上', job,
                        self.getNum(graduate)[0],
                        self.getSchool(graduate),
                        self.getNum(money)[0],
                        self.getNum(money)[1], publish_time
                    ])
                else:
                    careerSheet.append([
                        company,
                        self.getNum(big)[0], '以上', job, '无',
                        self.getSchool(graduate),
                        self.getNum(money)[0],
                        self.getNum(money)[1], publish_time
                    ])

    def startCrawl(self):
        """Crawl every keyword, following pagination until exhausted."""
        for i in range(len(self.keyword)):
            # crawl per keyword
            page = 0
            careerSheet = self.getExcel(self.keyword[i])
            url = 'https://www.zhipin.com/c101010100/?query=%s&page=%d&ka=page-%d' % (
                self.keyword[i], page, page)  # &period=3 limits to last 7 days
            soup = BeautifulSoup(self.getHtml(url), 'lxml')
            self.get_SaveMessage(careerSheet, soup)
            while soup.find('a', {'class': 'next'}):  # follow "next" links
                if soup.find('a', {'class': 'next disabled'}):
                    break
                page = page + 1
                url = 'https://www.zhipin.com/c101010100/?query=%s&page=%d&ka=page-%d' % (
                    self.keyword[i], page, page)
                soup = BeautifulSoup(self.getHtml(url), 'lxml')
                self.get_SaveMessage(careerSheet, soup)
            print(self.keyword[i] + 'is OK')
        self.saveExcel()