def parse_reviews():
    """Flatten the scraped review files into ``{DB_DUMPS_DIR}/reviews.json``.

    Walks ``REVIEWS_DIR/<date>/<category>/<product file>`` where ``<date>`` is
    whatever ``utils.parse_latest_date(REVIEWS_DIR)`` returns, ignoring every
    directory entry whose name starts with a dot. Each element of a file's
    ``"data"`` list becomes one flat record, and the full list of records is
    serialized with ``orjson`` and written in binary mode.

    Raises:
        ValueError: if a product file name does not contain ``".json"``, or a
            review date does not match ``%d.%m.%Y``.
        KeyError: if a review lacks one of the fields read below.
    """
    latest = utils.parse_latest_date(REVIEWS_DIR)
    snapshot_dir = f"{REVIEWS_DIR}/{latest}"
    category_names = [
        name for name in os.listdir(snapshot_dir) if not name.startswith(".")
    ]

    records = []
    for category_name in category_names:
        category_dir = f"{snapshot_dir}/{category_name}"
        file_names = [
            name for name in os.listdir(category_dir) if not name.startswith(".")
        ]
        for file_name in file_names:
            # The product id is the file name up to its first ".json".
            suffix_at = file_name.index(".json")
            product_id = file_name[:suffix_at]
            payload = load_json(f"{category_dir}/{file_name}")

            for item in payload["data"]:
                record = {
                    "product_id": product_id,
                    "source_id": item["id"],
                    "date": datetime.strptime(item["date"], "%d.%m.%Y"),
                    "rating": item["rating"],
                    "comment_plus": item["comment"]["plus"],
                    "comment_minus": item["comment"]["minus"],
                    "comment_text": item["comment"]["text"],
                }
                # The two feedback counters are appended after the base fields,
                # so they end up last in each serialized record.
                votes = item["feedback"]["reviewsRating"]
                record["review_approved"], record["review_rated"] = (
                    _parse_approved_rated(votes)
                )
                records.append(record)

    with open(f"{DB_DUMPS_DIR}/reviews.json", "wb") as out:
        out.write(orjson.dumps(records))
def parse_products():
    """Merge spec'd and plain product listings into ``products.json``.

    Products come from two places, both addressed with the value returned by
    ``utils.parse_latest_date(PRODUCTS_DIR)``:

    1. ``{SPECS_DIR}/<date>/<category>-list.json`` for the categories that have
       specs (``desktops`` and ``notebooks``); these are added first.
    2. Every non-hidden file in ``{PRODUCTS_DIR}/<date>``; a product is skipped
       when its ``source_id`` was already seen in step 1, so that a product
       listed in both places is written only once.

    The combined list is written to ``{DB_DUMPS_DIR}/products.json`` as UTF-8
    JSON with non-ASCII characters left unescaped.

    Raises:
        KeyError: if an entry of a ``*-list.json`` file has no ``source_id``.
    """
    # NOTE(review): the SPECS_DIR path below is built with the date derived
    # from PRODUCTS_DIR - confirm both trees always share the same date.
    date_latest = utils.parse_latest_date(PRODUCTS_DIR)
    products_latest = f"{PRODUCTS_DIR}/{date_latest}"
    categories = [
        c for c in os.listdir(products_latest) if not c.startswith(".")
    ]
    categories_with_specs = (
        "desktops",
        "notebooks",
    )

    products = []
    used_products = set()
    for category in categories_with_specs:
        category_products = load_json(
            f"{SPECS_DIR}/{date_latest}/{category}-list.json")
        used_products.update(p["source_id"] for p in category_products)
        products.extend(category_products)

    for category in categories:
        category_products = load_json(f"{products_latest}/{category}")
        # Previously ``used_products`` was collected but never consulted, so
        # spec'd products were emitted twice. Entries without a ``source_id``
        # are kept untouched (``None`` is never in the set).
        products.extend(
            p for p in category_products
            if p.get("source_id") not in used_products
        )

    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # pinned; relying on the locale default can raise UnicodeEncodeError or
    # produce a file that is not valid UTF-8 JSON.
    with open(f"{DB_DUMPS_DIR}/products.json", "w", encoding="utf-8") as f:
        json.dump(products, f, ensure_ascii=False)
def __init__(self, category):
    """Prepare the input and output locations for one category's reviews.

    Args:
        category: Category name; used in the product-list file name, in the
            output directory name and in the start-up log message.

    Sets ``self.parse_list`` to the ``<category>-list.json`` file under
    ``PRODUCTS_DIR/<date>`` (``<date>`` comes from
    ``parse_latest_date(PRODUCTS_DIR)``) and ``self.output_dir`` to
    ``REVIEWS_DIR/<today>/<category>``, creating that directory if needed.
    """
    super().__init__()

    latest_products = parse_latest_date(PRODUCTS_DIR)
    self.parse_list = f"{PRODUCTS_DIR}/{latest_products}/{category}-list.json"

    # Output is grouped under today's date rather than the products date.
    self.output_dir = f"{REVIEWS_DIR}/{date.today()}/{category}"
    os.makedirs(self.output_dir, exist_ok=True)

    self.log(f"Parser for {category} reviews has been started.")
def parse_products():
    """Concatenate the product listings into ``products.json``.

    Loads every non-hidden file found in ``{PRODUCTS_DIR}/<date>``, where
    ``<date>`` is the value returned by
    ``utils.parse_latest_date(PRODUCTS_DIR)``, extends one list with the
    contents of each file, and writes that list to
    ``{DB_DUMPS_DIR}/products.json`` as UTF-8 JSON with non-ASCII characters
    left unescaped.
    """
    date_latest = utils.parse_latest_date(PRODUCTS_DIR)
    products_latest = f"{PRODUCTS_DIR}/{date_latest}"
    categories = [
        c for c in os.listdir(products_latest) if not c.startswith(".")
    ]

    products = []
    for category in categories:
        products.extend(load_json(f"{products_latest}/{category}"))

    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # pinned; relying on the locale default can raise UnicodeEncodeError or
    # produce a file that is not valid UTF-8 JSON.
    with open(f"{DB_DUMPS_DIR}/products.json", "w", encoding="utf-8") as f:
        json.dump(products, f, ensure_ascii=False)
def parse_specs():
    """Run every scraped spec entry through ``processed_specs`` and dump it.

    Reads ``{SPECS_DIR}/<date>/<category>-specs.json`` for the ``desktops``
    and ``notebooks`` categories, where ``<date>`` is the value returned by
    ``utils.parse_latest_date(SPECS_DIR)``. Each entry is passed to
    ``processed_specs`` and the results are written, in file order, to
    ``{DB_DUMPS_DIR}/specs.json`` as UTF-8 JSON with non-ASCII characters
    left unescaped.
    """
    date_latest = utils.parse_latest_date(SPECS_DIR)
    categories_with_specs = (
        "desktops",
        "notebooks",
    )

    specs = []
    for category in categories_with_specs:
        products = load_json(f"{SPECS_DIR}/{date_latest}/{category}-specs.json")
        specs.extend(processed_specs(product) for product in products)

    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # pinned; relying on the locale default can raise UnicodeEncodeError or
    # produce a file that is not valid UTF-8 JSON.
    with open(f"{DB_DUMPS_DIR}/specs.json", "w", encoding="utf-8") as f:
        json.dump(specs, f, ensure_ascii=False)
def parse_categories():
    """Write the distinct product categories to ``categories.json``.

    Scans every non-hidden file in ``{PRODUCTS_DIR}/<date>`` (``<date>`` is
    the value returned by ``utils.parse_latest_date``), collects the unique
    ``(category_name, category_id)`` pairs found on the products, with each
    ``"%20"`` in the name replaced by a space, and dumps them sorted by name
    and then id as a list of ``{"name": ..., "source_id": ...}`` objects.
    """
    latest = utils.parse_latest_date(f"{PRODUCTS_DIR}")
    listing_dir = f"{PRODUCTS_DIR}/{latest}"
    file_names = [
        entry for entry in os.listdir(listing_dir) if not entry.startswith(".")
    ]

    # A set of tuples drops the duplicates that arise because every product
    # repeats its category.
    unique_pairs = set()
    for file_name in file_names:
        unique_pairs.update(
            (item["category_name"].replace("%20", " "), item["category_id"])
            for item in load_json(f"{listing_dir}/{file_name}")
        )

    # Tuples already compare by name first and id second.
    payload = []
    for name, source_id in sorted(unique_pairs):
        payload.append({"name": name, "source_id": source_id})

    with open(f"{DB_DUMPS_DIR}/categories.json", "w") as out:
        json.dump(payload, out)
def __init__(self, category):
    """Point the parser at the product list of one category.

    Args:
        category: Category name used to build the ``<category>-list.json``
            file name.

    Sets ``self.products_json`` to that file's path under
    ``SPECS_DIR/<date>``, where ``<date>`` is the value returned by
    ``parse_latest_date(SPECS_DIR)``.
    """
    super().__init__()
    latest_specs = parse_latest_date(SPECS_DIR)
    list_name = f"{category}-list.json"
    self.products_json = f"{SPECS_DIR}/{latest_specs}/{list_name}"
def _insert_from_source(source_file, query):
    """Bulk-insert the rows of one dump file using ``query``.

    Args:
        source_file: File name inside ``{DB_DUMPS_DIR}/<date>``, where
            ``<date>`` is the value returned by
            ``utils.parse_latest_date(DB_DUMPS_DIR)``.
        query: Passed unchanged as the first argument of
            ``LocalSession.bulk_insert_dicts``.
    """
    # The session is created before the file is read, as in the original.
    session = LocalSession()
    latest = utils.parse_latest_date(DB_DUMPS_DIR)
    dump_path = f"{DB_DUMPS_DIR}/{latest}/{source_file}"
    rows: List[dict] = load_json(dump_path)
    session.bulk_insert_dicts(query, rows)