def pre_load(self, data, *args, **kwargs):
    """Flatten an ``exchange-document`` record into top-level fields.

    The ``bibliographic-data`` sub-dict is removed from the record and
    selected values from it are copied to top-level keys: ``publications``,
    ``ipc_classes``, ``cpc_classes`` / ``us_classes`` (only when any patent
    classifications are present), ``applications``, ``priority_claims``,
    ``applicants``, ``inventors`` and ``title``.

    Args:
        data: Mapping holding the record under ``"exchange-document"``.
            That inner dict is modified in place.
        *args: Accepted for hook compatibility; unused.
        **kwargs: Accepted for hook compatibility; unused.

    Returns:
        The modified ``exchange-document`` dict.
    """
    data = data["exchange-document"]
    bib = data.pop("bibliographic-data")

    data["publications"] = resolve_list(
        bib, "publication-reference.document-id")

    ipc_class = resolve_list(bib, "classifications-ipcr.classification-ipcr")
    data["ipc_classes"] = [c["text"] for c in ipc_class]

    classifications = resolve_list(
        bib, "patent-classifications.patent-classification")
    if classifications:
        # Split the mixed classification list by its scheme marker.
        data["cpc_classes"] = [
            c for c in classifications
            if resolve(c, "classification-scheme.@scheme") == "CPCI"
        ]
        data["us_classes"] = [
            c for c in classifications
            if resolve(c, "classification-scheme.@scheme") == "UC"
        ]

    data["applications"] = resolve_list(
        bib, "application-reference.document-id")
    data["priority_claims"] = self.pre_load_priority_claims(bib)
    data["applicants"] = resolve_list(bib, "parties.applicants.applicant")
    data["inventors"] = resolve_list(bib, "parties.inventors.inventor")

    titles = resolve(bib, "invention-title")
    if isinstance(titles, list):
        # Prefer the English title. Fall back to the first entry instead of
        # letting a bare next() leak StopIteration when no title is English.
        english = next((t for t in titles if t["@lang"] == "en"), None)
        chosen = english if english is not None else titles[0]
        data["title"] = chosen["#text"]
    elif isinstance(titles, str):
        data["title"] = titles
    else:
        # Single title carrying attributes: the text lives under "#text".
        data["title"] = titles["#text"]
    return data
def __len__(self):
    """Return the number of results, capped by the configured limit.

    The total is read from ``@total-result-count`` on page 0. A falsy
    ``self.config["limit"]`` means no cap is applied.
    """
    first_page = self.get_page(0)
    total = int(resolve(first_page, "@total-result-count"))
    cap = self.config["limit"]
    if not cap:
        return total
    # Argument order matters: on a tie min() returns its first argument,
    # i.e. the total, exactly as the explicit comparison did.
    return min(total, cap)
def get_page(self, page_number):
    """Return the search-result page ``page_number``, fetching it on a miss.

    Pages are kept in ``self.pages`` keyed by page number. On a miss the
    search URL is requested with ``self.query_params(page_number)``, the XML
    response is parsed with ``xmltodict`` and the
    ``world-patent-data.biblio-search`` node is stored and returned.
    """
    if page_number in self.pages:
        return self.pages[page_number]

    params = self.query_params(page_number)
    # NOTE(review): the HTTP status is not checked here; an error response
    # is handed to the XML parser as-is. Confirm whether `session` already
    # raises on failures.
    response = session.get(self.search_url, params=params, timeout=10)
    parsed = xmltodict.parse(
        response.text,
        process_namespaces=True,
        namespaces=NS,
    )
    self.pages[page_number] = resolve(parsed, "world-patent-data.biblio-search")
    return self.pages[page_number]
def pre_load_priority_claims(self, bib):
    """Return the priority claims of ``bib`` with their document id inlined.

    Each claim found under ``priority-claims.priority-claim`` is copied, the
    keys of its ``document-id`` (the first one when several are given) are
    merged into the copy, and the nested ``document-id`` key is dropped.

    Args:
        bib: The ``bibliographic-data`` mapping of a record.

    Returns:
        A list of flattened claim dicts; empty when no claims are found.
    """
    claims = resolve(bib, "priority-claims.priority-claim")
    if claims is None:
        # Presumably `resolve` yields None when the record has no priority
        # claims (verify against its implementation); wrapping None in a
        # list would crash on the subscript below.
        return []
    if not isinstance(claims, list):
        # A single claim arrives as a bare mapping rather than a list.
        claims = [claims]

    flattened = []
    for claim in claims:
        doc_id = claim["document-id"]
        if isinstance(doc_id, list):
            doc_id = doc_id[0]
        # Document-id fields win over same-named claim fields.
        merged = {**claim, **doc_id}
        del merged["document-id"]
        flattened.append(merged)
    return flattened
def result_gen(offset, limit):
    """Yield ``(index, item)`` pairs page by page from the page holding ``offset``.

    Indices start at the first item of that page, so pairs with an index
    below ``offset`` may be produced. The ``limit`` argument is not consulted;
    the cap applied to list pages comes from ``self.config["limit"]``.
    """
    total_pages = math.ceil(len(self) / self.page_size)
    first_page = int(offset / self.page_size)
    index = first_page * self.page_size

    for page_idx in range(first_page, total_pages):
        items = resolve(self.get_page(page_idx), self.item_path)

        if not isinstance(items, list):
            # A page with a single result holds the item itself, not a list.
            yield index, items
            index += 1
            continue

        for entry in items:
            cap = self.config["limit"]
            if cap and not (index < cap):
                # Past the configured cap: skip without advancing the index.
                continue
            yield index, entry
            index += 1