def parse_flat(self, html: Element) -> None:  # noqa: CCR001
    """Parse a single flat listing.

    Extract the listing id, location, price, price per square metre,
    and derived area from the given HTML element, then persist a new
    Flat row if one does not already exist.

    :param html: Card element of one listing.
    """
    try:
        # The listing id is embedded in the card's first link ("flat/<id>").
        flat_url = html.find("a", first=True).attrs.get("href")
        flat_id = int(re.search(r"flat/(\d+)", flat_url).group(1))
        location = html.xpath(".//a[@data-name='GeoLabel']/text()")
        # The "ekb" domain prepends an extra geo label; drop it.
        if self.domain == "ekb":
            location = location[1:]
        city, district, *location = location
        location = " ".join(location)
        price = html.xpath(".//span[@data-mark='MainPrice']/text()", first=True)
        price = int(price.replace("₽", "").strip().replace(" ", ""))
        ppm = html.xpath(".//p[@data-mark='PriceInfo']/text()", first=True)
        ppm = int(ppm.replace("₽/м²", "").strip().replace(" ", ""))
        # Area is not listed directly; derive it from price / price-per-metre.
        square = round(price / ppm, 2)
        if not Flat.exists(id=flat_id):
            Flat(
                id=flat_id,
                city=city,
                district=district,
                location=location,
                price=price,
                ppm=ppm,
                square=square,
            )
            commit()
    except Exception as exc:
        print(exc)
        rollback()
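
# --- Context sketch (assumptions, not from the original source) ---
# parse_flat() presumes a Pony ORM entity roughly like the one below and a
# caller that runs inside a db_session; the field types here are inferred
# from the values built above and may differ from the real project.
from pony.orm import Database, Optional, PrimaryKey, Required

db = Database()


class Flat(db.Entity):
    id = PrimaryKey(int)
    city = Required(str)
    district = Required(str)
    location = Optional(str)
    price = Required(int)      # total price, roubles
    ppm = Required(int)        # price per square metre, roubles
    square = Required(float)   # derived area, square metres


db.bind(provider="sqlite", filename=":memory:")
db.generate_mapping(create_tables=True)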
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    for ipa_element in request_html.xpath(ipa_xpath_selector):
        m = re.search(config.ipa_regex, ipa_element.text)
        if not m:
            continue
        pron = m.group(1)
        # Removes parens around various segments.
        pron = pron.replace("(", "").replace(")", "")
        if _skip_pron(pron, config.skip_spaces_pron):
            continue
        try:
            # All pronunciation processing is done in NFD-space.
            pron = unicodedata.normalize("NFD", pron)
            pron = config.process_pron(pron)
        except IndexError:
            logging.info(
                "IndexError encountered processing %s during scrape of %s",
                pron,
                config.language,
            )
            continue
        if pron:
            # The segments package inserts a # in-between spaces.
            if not config.skip_spaces_pron:
                pron = pron.replace(" #", "")
            yield pron
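
# Why NFD? Composed characters hide combining marks from per-codepoint
# processing; decomposing first makes each diacritic individually
# addressable. A pure-stdlib illustration:
import unicodedata

composed = "é"  # U+00E9, a single precomposed codepoint
decomposed = unicodedata.normalize("NFD", composed)
print(len(composed), len(decomposed))     # 1 2
print([hex(ord(c)) for c in decomposed])  # ['0x65', '0x301'] -> 'e' + combining acute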
def movements(process: requests_html.Element) -> List[Dict]:
    rows = process.xpath('//tr')
    result = []
    for row in rows:
        data = []
        # requests_html re-parses each Element in isolation, so '//td'
        # only matches cells belonging to this row.
        for col in row.xpath('//td'):
            data.append(col.text)
        # First cell is the date ('data'); the remaining cells form the
        # docket entry text ('movimento').
        result.append({'data': data[0], 'movimento': ''.join(data[1:])})
    return result
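
# --- Usage sketch (invented markup; assumes only requests_html) ---
from requests_html import HTML

doc = HTML(html="""
<table>
  <tr><td>01/02/2023</td><td>Juntada de petição</td></tr>
  <tr><td>15/01/2023</td><td>Distribuído por sorteio</td></tr>
</table>
""")
table = doc.xpath('//table', first=True)
print(movements(table))
# [{'data': '01/02/2023', 'movimento': 'Juntada de petição'},
#  {'data': '15/01/2023', 'movimento': 'Distribuído por sorteio'}]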
def parts(process_parts: requests_html.Element) -> List[List[Dict]]:
    rows = process_parts.xpath('//tr')
    result = []
    for row in rows:
        data = []
        # Drop non-breaking spaces, glue labels that end in ':\n' back onto
        # their values, then split the row into one 'Label:value' per line.
        values = row.text.replace('\xa0', '').replace(':\n', ':').split('\n')
        for value in values:
            value = value.split(':')
            data.append({value[0]: value[1].strip()})
        result.append(data)
    return result
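
# A pure-Python walk-through of the splitting logic above, on an invented
# row text in the 'Label: value' shape the court site is assumed to emit:
text = 'Autor:\xa0Fulano de Tal\nAdvogado:\nMaria Souza'
values = text.replace('\xa0', '').replace(':\n', ':').split('\n')
print(values)  # ['Autor:Fulano de Tal', 'Advogado:Maria Souza']
print([{v.split(':')[0]: v.split(':')[1].strip()} for v in values])
# [{'Autor': 'Fulano de Tal'}, {'Advogado': 'Maria Souza'}]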
def general_data(process_general_data: requests_html.Element) -> Dict:
    result = {}
    names = [
        'Classe', 'Área', 'Assunto', 'Distribuição', 'Juiz', 'Relator',
        'Valor da ação'
    ]
    for name in names:
        # First table row whose full string content mentions the label.
        field = process_general_data.xpath(
            f"//tr[contains(string(), '{name}')]", first=True)
        if field:
            field = field.text
            # Normalise 'Label: value' to 'Label:\nvalue' so a single
            # split yields the (label, value) pair.
            field = field.replace(': ', ':\n')
            field = field.split(':\n')
            result[field[0]] = field[1]
    return result
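
# --- Usage sketch (invented markup; assumes only requests_html) ---
from requests_html import HTML

doc = HTML(html="""
<table>
  <tr><td>Classe: Procedimento Comum</td></tr>
  <tr><td>Juiz: João da Silva</td></tr>
</table>
""")
table = doc.xpath('//table', first=True)
print(general_data(table))
# {'Classe': 'Procedimento Comum', 'Juiz': 'João da Silva'}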
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    for ipa_element in request_html.xpath(ipa_xpath_selector):
        m = re.search(config.ipa_regex, ipa_element.text)
        if not m:
            continue
        pron = m.group(1)
        # Removes parens around various segments.
        pron = pron.replace("(", "").replace(")", "")
        if _skip_pron(pron):
            continue
        pron = config.process_pron(pron)
        if pron:
            yield pron
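
# The regex step above keeps only capture group 1. A pure-stdlib sketch,
# assuming config.ipa_regex is a slash-delimited pattern like this one
# (the real pattern lives in the project's Config):
import re

ipa_regex = r"/(.+?)/"
m = re.search(ipa_regex, "enPR: hĕ-lō′, IPA(key): /həˈloʊ/")
print(m.group(1))  # həˈloʊ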