def main():
    """Convert the ``.html`` files found under ``files_new/`` into XML files.

    Every entry of ``files_new/`` is announced on stdout; only directories
    are descended into.  For each regular file whose name ends in ``.html``
    the page is loaded with ``getsoup``, its spec section is taken with
    ``specsection``, and one field per entry of ``regexes`` is filled in via
    ``getspec``.  The title and body come from ``getsouptitle`` and
    ``getbodytext``.

    A page with fewer than 8 truthy field values is reported and skipped.
    Any other page is serialised with ``dicttoxml`` (root element ``page``),
    pretty-printed through ``parseString(...).toprettyxml()`` and handed to
    ``write_to_file`` under the name ``<counter>.xml``; the counter only
    advances for pages that are written.
    """
    # One record is reused for every page: the fields assigned from
    # ``regexes`` plus "title" and "body" are overwritten on each file.
    record = {
        "title": "",
        'screen_size': "",
        'camera_res': "",
        'screen_resolution': "",
        'battery_capacity': "",
        'ram': "",
        'internal_memory': "",
        'processor_speed': "",
        'weight': "",
        "body": "",
    }
    written = 0

    for entry in Path('files_new/').iterdir():
        # The announcement is printed for every entry, directory or not.
        print(f"Converting folder: {entry.name}...")
        if not entry.is_dir():
            continue

        for page in entry.iterdir():
            if not (page.is_file() and page.name.endswith(".html")):
                continue

            soup = getsoup(page)
            section = specsection(soup)
            record.update(
                (field, getspec(section, pattern))
                for field, pattern in regexes.items()
            )
            record["title"] = getsouptitle(soup)
            record["body"] = getbodytext(soup)

            # Pages with too few populated fields are not worth exporting.
            populated = sum(1 for value in record.values() if value)
            if populated < 8:
                print(f"skiped {page.name}")
                continue

            raw_xml = dicttoxml(record, attr_type=None, custom_root="page")
            pretty = parseString(raw_xml).toprettyxml()
            print(f"Writing file: {page.name}")
            write_to_file(pretty, f'{written}.xml')
            written += 1

        print(f"Done with {entry.name}!")

    print("Finished")
def main():
    """Load the page at ``PATH``, preprocess it and print its specs.

    The page is obtained with ``getsoup``, passed through ``preprocess``,
    and whatever ``getspecs`` returns for the result is printed to stdout.
    """
    prepared = preprocess(getsoup(PATH))
    print(getspecs(prepared))
def main():
    """Load ``SPEC_PAGE``, preprocess it and run ``getspecs`` on the result.

    The value returned by ``getspecs`` is intentionally not used here; the
    call is made only for whatever effect ``getspecs`` itself has.
    """
    prepared = preprocess(getsoup(SPEC_PAGE))
    getspecs(prepared)
def main():
    """Print the specs extracted from ``PHONE_SPECS_PAGE``.

    The page is loaded with ``getsoup`` and cleaned with ``preprocess``.
    Every element carrying the class ``techspecs-section`` is collected
    with ``find_all`` and passed to ``getspecs``; its result is printed.
    """
    page = preprocess(getsoup(PHONE_SPECS_PAGE))
    sections = page.find_all(class_="techspecs-section")
    print(getspecs(sections))
def main():
    """Print the specs extracted from ``SPEC_PAGE``.

    The page is loaded with ``getsoup`` and cleaned with ``preprocess``.
    The elements returned by ``find_all`` for the class string
    ``tech_spec_wrap spec_toggle`` are passed to ``getspecs`` and the
    result is printed.
    """
    page = preprocess(getsoup(SPEC_PAGE))
    wrappers = page.find_all(class_="tech_spec_wrap spec_toggle")
    print(getspecs(wrappers))