Example #1
def save_to_xlsx(data,
                 filename='',
                 fieldnames=None,
                 optimize=False,
                 open=False,
                 date_insert=True):
    if not __check_data(data, filename): return None
    data, fieldnames = __init(data, filename, fieldnames, optimize)
    newfilename = _get_new_file_name_with_datetime('.xlsx', filename,
                                                   date_insert)
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(fieldnames)
    i = -1
    for i, each in enumerate(
            data.values() if isinstance(data, dict) else data):
        line = []
        for key in fieldnames:
            value = each.get(key, '')
            if not isinstance(value, (int, float)) or len(str(value)) > 10:
                value = str(value) if not optimize else re.sub(
                    r'\s+', ' ', str(value)).strip()
            line.append(value)
        try:
            ws.append(line)
        except openpyxl.utils.exceptions.IllegalCharacterError:
            print(f'save_to_xlsx: IllegalCharacterError: {line}')  # Todo: only for debug
            ws.append([str(x).encode('unicode_escape').decode('utf-8') for x in line])  # str() guards numeric cells
    __view_enhancement(ws)
    wb.save(newfilename)
    print(f"{newfilename} / {i + 1} lines saved / ", end='')
    if open: os.startfile(newfilename)
    return newfilename
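
# Usage sketch (an assumption, not part of the original module): data may be a
# list of dicts or a dict of dicts; missing keys become '' and the helpers add
# a date suffix to the hypothetical file name.
rows = [{'sku': 'A-1', 'price': 9.5}, {'sku': 'B-2', 'price': 12.0}]
save_to_xlsx(rows, filename='prices', fieldnames=['sku', 'price'])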
Example #2
def delete_file(url):
    file_name = Sw.get_cache_path(url)
    if os.path.exists(file_name):
        os.remove(file_name)
        print(f'=== delete file  {url}')
    else:
        print(f'=== file not found  {url}')
Example #3
def save_to_csv(data,
                filename='',
                fieldnames=None,
                optimize=False,
                open=False,
                date_insert=True,
                SEP=','):
    if not __check_data(data, filename): return None
    QC, NL = '"', '\r\n'  # quote char and newline (the separator is SEP)
    data, fieldnames = __init(data, filename, fieldnames, optimize)
    file_extension = '.tsv' if SEP == '\t' else '.csv'  # default to .csv for any separator other than tab
    newfilename = _get_new_file_name_with_datetime(file_extension, filename,
                                                   date_insert)
    with codecs.open(newfilename, 'w', encoding='utf-8') as file:
        file.write(SEP.join([f'{QC}{x}{QC}' for x in fieldnames]) + NL)
        i = -1
        for i, each in enumerate(
                data.values() if isinstance(data, dict) else data):
            line = []
            for key in fieldnames:
                value = each.get(key, '')
                if isinstance(value, float): value = str(value)
                value = str(value) if not optimize else re.sub(
                    r'\s+', ' ', str(value)).strip()
                line.append(value.replace('"', '""'))
            file.write(SEP.join([f'{QC}{x}{QC}' for x in line]) + NL)
    print(f"{newfilename} / {i + 1} lines saved / ", end='')
    if open: os.startfile(newfilename)
    return newfilename
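
# Usage sketch (an assumption, not part of the original module): same row format
# as save_to_xlsx; SEP='\t' switches the extension to .tsv, and optimize=True
# collapses runs of whitespace inside every value.
rows = [{'sku': 'A-1', 'name': 'Bolt  M6'}, {'sku': 'B-2', 'name': 'Nut M6'}]
save_to_csv(rows, filename='prices', fieldnames=['sku', 'name'], SEP='\t', optimize=True)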
Example #4
def change_main_column(data, maincolumn):
    result = {}
    for each in data.values():
        result[each[maincolumn]] = each
    print(
        f'change_main_column: {len(data)} lines / {len(result)} loaded / {len(data) - len(result)} lost'
    )
    return result
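
# Usage sketch (an assumption): re-key a dict of rows by another column; rows
# sharing the same 'sku' overwrite each other, which the 'lost' count reports.
data = {'1': {'id': '1', 'sku': 'A-1'}, '2': {'id': '2', 'sku': 'B-2'}}
by_sku = change_main_column(data, 'sku')  # {'A-1': {...}, 'B-2': {...}}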
Example #5
def _copyfile(url, cache_path, new_path, file_type, path):
    if os.stat(cache_path).st_size == 0: return None  # Todo try to download one more time
    if 'images' in path and file_type in ['.jpg', '.png', '.gif']:
        img = PIL.Image.open(cache_path)
        if (img.size[0] + img.size[1]) < 200: return None
    print(f'copy file  {url}', only_debug=True)  # only_debug implies print is the module's SwPrint wrapper (set up in Example #18), not the builtin
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    shutil.copyfile(cache_path, new_path)
    return True
Example #6
def __generate_fieldnames_optimized(data, fieldnames):
    new_fieldnames = []
    for each in data.values() if isinstance(data, dict) else data:
        if not isinstance(each, dict): raise ValueError('Wrong data')
        for key, value in each.items():
            if value != '' and key not in new_fieldnames:
                new_fieldnames.append(str(key))
    additional_fields = [x for x in new_fieldnames if x not in fieldnames]
    cleared_fields = [x for x in fieldnames if x not in new_fieldnames]
    if cleared_fields: print('deleted columns: ' + ', '.join(cleared_fields))
    return [x for x in fieldnames if x in new_fieldnames] + additional_fields
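
# Behavior sketch (an assumption): columns that are empty in every row are
# dropped (and reported), known fieldnames keep their order, and unseen keys
# are appended at the end.
rows = [{'a': 1, 'b': '', 'c': 3}]
print(__generate_fieldnames_optimized(rows, ['a', 'b']))  # ['a', 'c'], prints 'deleted columns: b'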
Example #7
def generate_img(sku, name, path='images', brand='logo'):
    file_name = f'{Sw.good_name(Sw.transliterate(name))}.jpg'
    full_path = f'{path}\\{file_name}'
    if os.path.exists(full_path): return file_name
    print(f'generate img  {full_path}')
    fnt = PIL.ImageFont.truetype(r'C:\Windows\Fonts\Arial.ttf', 60)  # raw string avoids invalid-escape warnings in the Windows path
    img = PIL.Image.open(f'{brand.lower().replace(" ", "_")}.png').convert('RGB')
    d = PIL.ImageDraw.Draw(img)
    d.text((60, 880), sku, fill=0, font=fnt)
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    img.save(full_path, quality=100, optimize=True, progressive=True)
    return file_name
Example #8
def download(url):
    try:
        page = requests.get(url, headers={'User-Agent': fake_useragent.UserAgent().chrome})
    except Exception:
        time.sleep(1)  # short pause before a single retry
        try:
            page = requests.get(url, headers={'User-Agent': fake_useragent.UserAgent().chrome})
        except Exception:
            print(f'=== download error  {url}')
            return None
    if page.status_code != 200:
        print(f'=== page status code {page.status_code} for  {url}')
        return None
    return page
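
# Usage sketch (an assumption, URL is hypothetical): download returns a
# requests.Response on success and None after a failed retry or a non-200
# status, so callers must guard before touching the result.
page = download('https://example.com/catalog')
if page is not None: print(len(page.content))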
Example #9
def _create_web_driver(url):
    global _parsing_web_driver
    chrome_options = selenium.webdriver.ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    prefs = {"profile.managed_default_content_settings.images": 2}
    # prefs = {}
    chrome_options.add_experimental_option("prefs", prefs)
    _parsing_web_driver = selenium.webdriver.Chrome(
        'C:\\Users\\Administrator\\Documents\\_python\\webdriver\\chromedriver.exe',
        options=chrome_options)  # Selenium 3 style: the driver path is passed positionally
    _parsing_web_driver.implicitly_wait(10)
    _parsing_web_driver.get(url)
    input('continue?')
    print('continuing')
Example #10
def tmp():  # keeps these imports alive for the IDE's "Optimize imports"
    time()
    urllib()
    bs4()
    Category()
    Deepl()
    FindDigits()
    Html()
    LoadDictFromFile()
    Parsing()
    Product()
    SaveDictToFile()
    Sw()
    WorkWithJSON()
    print()
    datetime()
    quote()
    urljoin()
Example #11
def _xls_import(filename, maincolumn, language, optimize, recognize, title_row,
                first_row):
    res = {}
    sheet = xlrd.open_workbook(filename).sheet_by_index(0)  # Todo
    titles = __titles(__xls_titles(sheet, optimize, title_row), language,
                      optimize)
    index = __find_index(maincolumn, titles)
    first_row = max(first_row, title_row + 1) if title_row else first_row
    for a in range(first_row - 1, sheet.nrows):
        row = [
            __correct(__xlrd_get_value(sheet.cell(a, col)), optimize)
            for col in range(0, len(titles))
        ]
        name = str(row[index] if index is not None else a + 1)
        if name: res[name] = {titles[i]: row[i] for i in range(0, len(titles))}
    rows_count = sheet.nrows - title_row if title_row else sheet.nrows
    print(
        f"{filename} / {rows_count} lines / {len(res)} loaded / {rows_count - len(res)} lost / ",
        end='')
    if recognize: _recognize_data(res)
    return res
Example #12
def download_file_from_web(url, name, path='images'):
    if not name: name = urllib.parse.unquote(url[url.rfind('/') + 1:url.rfind('.')])
    name = Sw.good_name(name)
    cache_path = Sw.get_cache_path(url)
    right_part = url[url.rfind('/') + 1:]
    if '?' in right_part: right_part = right_part[:right_part.rfind('?')]
    file_type = right_part[right_part.rfind('.'):].lower() if '.' in right_part else ''
    if file_type == '.jpeg': file_type = '.jpg'
    if 'treston' in path and not file_type: file_type = '.pdf'
    if len(file_type) > 4 or not file_type:
        print(f'=== bad file_type "{file_type}" in url  {url}')
        return ''
        # raise ValueError
    new_path = f'{path}\\{name}{file_type}'
    if os.path.exists(cache_path) and os.path.exists(new_path):
        if os.stat(cache_path).st_size == os.stat(new_path).st_size:
            print(f'do nothing  {url}', only_debug=True)
        elif _copyfile(url, cache_path, new_path, file_type, path) is None:
            return ''
    elif os.path.exists(cache_path):
        if _copyfile(url, cache_path, new_path, file_type, path) is None: return ''
    else:
        print(f'download file  {url}')
        urllib3.disable_warnings()
        page = download(url)
        if page is None: return ''
        if '\\' in cache_path: pathlib.Path(cache_path[:cache_path.rfind('\\')]).mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'wb') as file:
            file.write(page.content)
        if _copyfile(url, cache_path, new_path, file_type, path) is None: return ''
    return new_path.replace(f'{path}\\', '').lower()
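
# Usage sketch (an assumption, URL is hypothetical): the return value is the
# saved file name relative to `path`, lower-cased, or '' when the file type is
# unknown or the cached copy is rejected by _copyfile.
img = download_file_from_web('https://example.com/img/photo.jpg?v=2', 'photo', path='images')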
Example #13
def _xlsx_import(filename, maincolumn, language, optimize, recognize,
                 title_row, first_row):
    res = {}
    sheet = openpyxl.load_workbook(filename).active
    titles = __titles(__xlsx_titles(sheet, optimize, title_row), language,
                      optimize)
    index = __find_index(maincolumn, titles)
    first_row = max(first_row, title_row + 1) if title_row else first_row
    for a in range(first_row, sheet.max_row + 1):
        row = [
            __correct(sheet.cell(row=a, column=col).value, optimize)
            for col in range(1, len(titles) + 1)
        ]
        name = str(row[index] if index is not None else a)
        if name: res[name] = {titles[i]: row[i] for i in range(0, len(titles))}
    rows_count = sheet.max_row - title_row if title_row else sheet.max_row
    print(
        f"{filename} / {rows_count} lines / {len(res)} loaded / {rows_count - len(res)} lost / ",
        end='')
    if recognize: _recognize_data(res)
    return res
Example #14
def _csv_import(filename, maincolumn, language, optimize, recognize, delimiter,
                title_row, first_row):
    res = {}
    with codecs.open(filename, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=delimiter, quotechar='"')
        data = [row for row in reader]
    titles = __titles(data[title_row - 1], language,
                      optimize) if title_row else alphabet(len(data[0]))
    index = __find_index(maincolumn, titles)
    for a, row in enumerate(data[first_row - 1:]):
        if not row: continue
        name = str(
            __correct(row[index], optimize) if index is not None else a + 2)
        if name:
            res[name] = {
                titles[i]: __correct(row[i], optimize)
                for i in range(0, len(titles))
            }
    print(
        f"{filename} / {len(data) - 1} lines / {len(res)} loaded / {len(data) - 1 - len(res)} lost / ",
        end='')
    if recognize: _recognize_data(res)
    return res
Example #15
def get_htmls_from_webdriver(url, file_name, additional_func=None):
    global _parsing_web_driver
    if not _parsing_web_driver: _create_web_driver(url)
    try:
        _parsing_web_driver.get(url)
    except Exception:
        try:
            _parsing_web_driver.get(url)  # one retry
        except Exception:
            print(f'=== download error  {url}')
            return ''
    time.sleep(2)
    html_text = _parsing_web_driver.page_source

    # if '404 - File or directory not found.' in html_text:
    #     print(f'=== code 404  {url}')
    #     return ''

    if additional_func is not None:
        additional_func(_parsing_web_driver)
        html_text = _parsing_web_driver.page_source
    save_text_to_file(file_name, html_text)
    return html_text
Example #16
def load(filename,
         maincolumn=None,
         language=None,
         optimize=False,
         recognize=False,
         delimiter=',',
         title_row=1,
         first_row=2):
    if filename.endswith('.csv'):
        return _csv_import(filename, maincolumn, language, optimize, recognize,
                           delimiter, title_row, first_row)
    elif filename.endswith('.xls'):
        return _xls_import(filename, maincolumn, language, optimize, recognize,
                           title_row, first_row)
    elif filename.endswith('.xlsx') or filename.endswith('.xlsm'):
        try:
            return _xlsx_import(filename, maincolumn, language, optimize,
                                recognize, title_row, first_row)
        except KeyError:
            print('Error: bad file format. Will try to use xls instead')
            return _xls_import(filename, maincolumn, language, optimize,
                               recognize, title_row, first_row)
    else:
        raise ValueError(f'Wrong filetype: {filename}')
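
# Usage sketch (an assumption, file name and column are hypothetical): load
# dispatches on the extension and returns a dict keyed by maincolumn (or by
# row number when maincolumn is None).
catalog = load('catalog.xlsx', maincolumn='sku', optimize=True)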
Example #17
def check_sku(sku, url, good_symbols=' .-+/'):
    if not sku:
        print(f'=== no sku  {url}')
        return False
    if len(sku) > 21: print(f'=== sku is too long  {sku} {url}')
    for each in sku:
        if not (each.isalpha() or each.isdecimal() or each in good_symbols):
            print(f'=== special characters in sku  {sku} {url}')
            return False
    return True
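
# Usage sketch (an assumption, URLs are hypothetical): letters, digits and the
# characters in good_symbols pass; anything else is reported and rejected.
check_sku('AB-12.3', 'https://example.com/p/1')  # True
check_sku('AB#12', 'https://example.com/p/2')    # False: special characters in sku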
Example #18
def wrapper2(*args, **kwargs):
    # inner wrapper of a decorator: `func` and `debug` come from the enclosing scope
    start_time = time.time()
    start_file_name = inspect.getfile(func)
    if '/' in start_file_name: start_file_name = start_file_name[start_file_name.rfind('/') + 1:]
    if '\\' in start_file_name: start_file_name = start_file_name[start_file_name.rfind('\\') + 1:]
    SwPrint.SwPrint(debug=debug, prj_name=start_file_name)
    print('start')
    result = func(*args, **kwargs)
    global _parsing_web_driver
    if _parsing_web_driver: _parsing_web_driver.quit()
    print(f'done in {GlobalFunctions.generate_time_string(time.time() - start_time)}')
    print('end')
    SwPrint.SwPrint.save_log_to_file()
    return result
Example #19
def get_htmls_from_web(url, simple=False, additional_func=None):
    result = []
    file_name = Sw.get_cache_path(url, html=True)
    if os.path.exists(file_name):
        print(f'use existing  {url}', only_debug=True)
        result.append(read_file(file_name))
        i = 1
        while True:
            file_name_dop = f'{file_name}_{i + 1}.html'
            if not os.path.exists(file_name_dop): break
            print(f'use existing extra page  {file_name_dop}', only_debug=True)
            result.append(read_file(file_name_dop))
            i += 1
    else:
        print(f'download  {url}')
        if simple:
            result.append(get_simple_html(url, file_name))
        else:
            result.append(get_htmls_from_webdriver(url, file_name, additional_func))
    return result
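
# Usage sketch (an assumption, URL is hypothetical): cached pages and their
# `_2.html`, `_3.html` continuations are reused; otherwise the page is fetched
# via requests (simple=True) or through the shared webdriver.
htmls = get_htmls_from_web('https://example.com/list', simple=True)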
Example #20
def correct_images_sources(soup, source_url=''):
    for tag in soup.find_all('img'):
        src = urllib.parse.urljoin(source_url, tag.get('src'))
        print(f'got image  {src}')
        tag.attrs.clear()
        tag.attrs['src'] = src
Example #21
def __change_file_type(file_name, file_type):
    if '.' in file_name:
        # rebuild from the last dot so only the extension is replaced
        file_name = file_name[:file_name.rfind('.')] + f'.{file_type}'
    else:
        print(f'=== download_imgs error - no file type {file_name}')
    return file_name
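
# Behavior sketch (an assumption): only the extension after the last dot is
# swapped; names without a dot are returned unchanged after the error message.
print(__change_file_type('photo.jpeg', 'jpg'))  # photo.jpg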
Example #22
def __check_data(data, filename):
    if data: return True
    print(f'{filename} / nothing to save / ', end='')
    return False