def sync(obj: Base, *args, **kwargs):
    """Scrape `.entry-title` headings from the sync base URL and queue one text task per article."""
    response = Internet.html_get(obj.sync_type.base_url)
    # Iterate in reverse document order so the oldest entry is queued first.
    for title in reversed(response.html.find(".entry-title")):
        anchor = title.find("a", first=True)
        href = anchor.attrs.get('href')
        obj.add_text_task(unique_key=href, name=anchor.text.strip(), url=href, data={})
def sync(obj: Base, *args, **kwargs):
    """Fetch the category/type listing page and queue a task per article link.

    kwargs:
        cat, typ: substituted into the base URL template (default "").
    """
    target = obj.sync_type.base_url.format(
        cat=kwargs.get("cat", ""), typ=kwargs.get("typ", "")
    )
    page = Internet.html_get(target)
    anchors = page.html.xpath("/html/body/main/section/article[*]/h2/a")
    # Oldest entries first.
    for anchor in reversed(anchors):
        # Drop any query string so the key and URL are stable.
        clean = anchor.attrs.get("href").split("?")[0]
        obj.add_text_task(unique_key=clean, name=anchor.text.strip(), url=clean, data={})
def sync(obj: Base, *args, **kwargs):
    """Scrape `.post-card-title` headings and queue one text task per article.

    Relative hrefs are resolved against the sync base URL. Entries are
    processed oldest-first (reverse document order).
    """
    base_url = obj.sync_type.base_url
    res = Internet.html_get(base_url)
    for h2 in res.html.find(".post-card-title")[::-1]:
        a = h2.find("a", first=True)
        link = urllib.parse.urljoin(base_url, a.attrs.get('href'))
        # Compute the name once (original called a.text.strip() twice and
        # also printed it — leftover debug output, removed).
        name = a.text.strip()
        obj.add_text_task(unique_key=link, name=name, url=link, data={})
async def test_url(url_test: UrlTest):
    """ Test given URL """
    try:
        response = Internet.html_get(url_test.url)
        # Package the raw response for the caller.
        return UrlTestOut(
            url=url_test.url,
            status_code=response.status_code,
            body=response.content,
            headers=response.headers,
        )
    except Exception as exc:
        # Deliberate best-effort: report any failure as a plain string
        # rather than propagating the exception.
        return str(exc)
def sync(obj: Base, *args, **kwargs):
    """Scrape the listing page via a fixed XPath and queue a task per article link."""
    page = Internet.html_get(obj.sync_type.base_url)
    anchors = page.html.xpath(
        '/html/body/div[1]/div[2]/div/div[4]/div[2]/section/div[*]/div[*]/div[2]/a'
    )
    # Oldest entries first.
    for anchor in reversed(anchors):
        # Strip the query string for a stable key/URL.
        clean_url = anchor.attrs.get('href').split("?")[0]
        obj.add_text_task(
            unique_key=clean_url,
            name=anchor.text.strip(),
            url=clean_url,
            data=dict(text=clean_url),
        )
def sync(obj: Base, *args, **kwargs):
    """Scrape article headings via XPath and queue one task per absolute link."""
    base = obj.sync_type.base_url
    page = Internet.html_get(base)
    anchors = page.html.xpath(
        "/html/body/form/div[4]/div[3]/div/div[1]/div[*]/div/div[1]/h3/a")
    # Oldest entries first.
    for anchor in reversed(anchors):
        # Resolve relative hrefs against the listing page URL.
        absolute = urllib.parse.urljoin(base, anchor.attrs.get('href'))
        obj.add_text_task(
            unique_key=absolute,
            name=anchor.text.strip(),
            url=absolute,
            data=dict(text=absolute),
        )
def sync(obj: Base, *args, **kwargs):
    """Scrape `.crayons-story__title` headings for a category and queue a task per story.

    kwargs:
        cat: optional category path joined onto the base URL (default "").
    """
    # Hoist the loop-invariant base URL; the original also reused the single
    # name `url` for both the listing URL and each article URL — keep distinct.
    base_url = obj.sync_type.base_url
    listing_url = urllib.parse.urljoin(base_url, kwargs.get("cat", ""))
    res = Internet.html_get(listing_url)
    # Oldest stories first.
    for h2 in res.html.find(".crayons-story__title")[::-1]:
        a = h2.find('a', first=True)
        article_url = urllib.parse.urljoin(base_url, a.attrs.get('href'))
        obj.add_text_task(
            # NOTE(review): assumes every title anchor carries an `id`
            # attribute — `.strip()` raises AttributeError otherwise;
            # confirm against the site's markup.
            unique_key=a.attrs.get('id').strip(),
            name=a.text.strip(),
            url=article_url,
            data=dict(text=article_url),
        )
def sync(obj: Base, *args, **kwargs):
    """Collect anchors from every configured XPath and queue a task per link.

    XPaths come from ``obj.sync_type.extras["xp"]``; results from ALL of them
    are concatenated (unlike sibling variants that stop at the first match).
    """
    r = Internet.html_get(obj.sync_type.base_url)
    found_links = []
    # Extending with an empty result is a no-op, so the original's
    # `if links:` guard was redundant and is dropped.
    for xpath in obj.sync_type.extras.get("xp"):
        found_links.extend(r.html.xpath(xpath))
    # Oldest entries first.
    for a in found_links[::-1]:
        # Drop the query string for a stable unique key.
        url = a.attrs.get('href').split("?")[0]
        obj.add_text_task(unique_key=url, name=a.text.strip(), url=url, data={})
def sync(obj: Base, *args, **kwargs):
    """Try each configured XPath until one matches, then queue a task per link.

    XPath candidates come from ``extras["xp"]``; relative hrefs are resolved
    against ``extras["article_url"]``.
    """
    page = Internet.html_get(obj.sync_type.base_url)
    matched = []
    # First-match-wins across the configured XPath candidates.
    for candidate in obj.sync_type.extras.get("xp"):
        matched = page.html.xpath(candidate)
        if matched:
            break
    article_base = obj.sync_type.extras.get("article_url")
    # Oldest entries first.
    for anchor in reversed(matched):
        absolute = urllib.parse.urljoin(article_base, anchor.attrs.get('href'))
        obj.add_text_task(
            unique_key=absolute, name=anchor.text.strip(), url=absolute, data={}
        )
def sync(obj: Base, *args, **kwargs):
    """Fetch a category listing, trying two known page layouts, and queue a task per article.

    kwargs:
        cat: substituted into the base URL template (default "").
    """
    listing = obj.sync_type.base_url.format(cat=kwargs.get("cat", ""))
    res = Internet.html_get(listing)
    # The site serves two layouts; the first XPath that matches wins.
    candidates = (
        "/html/body/main/div[2]/div/div/div[1]/div/div[2]/div/article[*]/div/div[2]/a",
        "/html/body/main/div[2]/div/div/div[1]/div/article[*]/div/div[2]/a",
    )
    links = []
    for xp in candidates:
        links = res.html.xpath(xp)
        if links:
            break
    article_base = obj.sync_type.extras.get("base_url")
    # Oldest entries first.
    for a in reversed(links):
        target = urljoin(article_base, a.attrs.get("href"))
        # Flatten embedded newlines in the title for downstream display.
        title = a.text.strip().replace("\n", "--")
        obj.add_text_task(unique_key=target, name=title, url=target, data={})