import datetime
import traceback

from BeautifulSoup import BeautifulSoup

# DATE_RE, DAGSTIMI_RE, SATUHJA_RE, FJARSTADDIR_RE, SKAMMSTOFUN_RE,
# UMRAEDA_ID_RE and MYND_URL, the Django models (Fundur, Thingmadur, Flokkur,
# Flokksseta, Umraeda, Kosning, Atkvaedi), the url_to_lth_fnr helper and the
# ScraperParserHTML base class are assumed to be defined elsewhere in the
# module; only the parse() methods survive in this fragment, and their
# enclosing class statements are not shown.

# Parses a fundur (parliamentary session) page: extracts the session title,
# number and timestamp from the page, then creates or updates the matching
# Fundur record.
def parse(self, url, data, fromEncoding=None):
    soup = BeautifulSoup(data, fromEncoding=(fromEncoding or "windows-1252"))
    title = soup.fetch("title")[0].string
    fnr = soup.fetch("h1")[0].string.split(".", 1)[0]

    # The page title carries the session timestamp.
    m = DATE_RE.search(title)
    year = int(m.group(1))
    mon = int(m.group(2))
    day = int(m.group(3))
    hour = int(m.group(4))
    minu = int(m.group(5))

    existing = Fundur.objects.filter(fnr=fnr)
    if existing:
        fn = existing[0]
    else:
        fn = Fundur()
    fn.titill = title
    fn.lth, fn.fnr = url_to_lth_fnr(url)
    fn.dags = "%4.4d-%2.2d-%2.2d %2.2d:%2.2d" % (year, mon, day, hour, minu)
    fn.save()

    return ScraperParserHTML.parse(self, url, data, soup=soup)
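# url_to_lth_fnr is used throughout these parsers but is not defined in this
# fragment. A minimal sketch of what it presumably does: pull the legislative
# session number (lth) and meeting number (fnr) out of an althingi.is URL,
# returning (None, None) when they are absent. The query parameter names
# 'lthing' and 'fundur' below are illustrative guesses, not taken from the
# original code.
def url_to_lth_fnr(url):
    import urlparse
    query = urlparse.parse_qs(urlparse.urlparse(url).query)
    lth = (query.get('lthing') or [None])[0]  # hypothetical parameter name
    fnr = (query.get('fundur') or [None])[0]  # hypothetical parameter name
    return lth, fnr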
# Parses the party-list page and registers each party (flokkur) it finds,
# both from the single-letter links in the <li> menu and from the thingfl=
# links in the table rows.
def parse(self, url, data, fromEncoding=None):
    # The source HTML is missing a space before some alt attributes
    # (e.g. src="x"alt="y"), which confuses the parser; patch it before souping.
    soup = ScraperParserHTML.parse(self, url, data.replace('"alt', '" alt'),
                                   fromEncoding=(fromEncoding or 'windows-1252'))
    urlbase, urldir = self.urls(url)

    for li in soup.fetch('li'):
        for a in li.fetch('a', {}, False):
            flokk_url = a.get('href', '')
            # Party links look like "/xf": a single path component.
            if flokk_url.startswith('/') and '/' not in flokk_url[1:]:
                stafir = flokk_url[1:]
                flokk_url = urlbase + stafir
                try:
                    nafn = a.string
                    self.skilgreina_flokk(nafn, stafir, flokk_url)
                except:
                    traceback.print_exc()

    for tr in soup.fetch('tr'):
        for td in tr.fetch('td', {}, False):
            for a in td.fetch('a', {}, False):
                flokk_url = a.get('href', '')
                if 'thingfl=' in flokk_url:
                    try:
                        stafir = a.string
                        nafn = tr.fetch('span', {'class': 'FyrirsognSv'})[0].string
                        self.skilgreina_flokk(nafn, stafir, flokk_url)
                    except:
                        traceback.print_exc()

    return True
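# skilgreina_flokk ("define party") is not defined in this fragment. Judging
# from the get-or-create pattern the MP parser below uses for Flokkur, it
# presumably does something like this sketch; the body is an assumption,
# including whether flokk_url is stored at all.
def skilgreina_flokk(self, nafn, stafir, flokk_url):
    fl = Flokkur.objects.filter(abbr=stafir)
    if not fl:
        print 'Flokkur: %s (%s)' % (nafn, stafir)
        Flokkur(abbr=stafir, nafn=nafn).save()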
# Parses an MP-list page: creates or updates Thingmadur records, creates any
# party (Flokkur) records it encounters, and opens a party membership
# (Flokksseta) whenever an MP's current party does not match the page.
def parse(self, url, data, fromEncoding=None):
    soup = ScraperParserHTML.parse(self, url, data,
                                   fromEncoding=(fromEncoding or 'windows-1252'))
    urlbase, urldir = self.urls(url)
    # Pages listing substitute MPs (varamenn) are selected with tegund=[V].
    varamenn = 'tegund=[V]' in url

    # <tr><td ...><nobr><a href="/altext/cv.php4?...">Ossur ...</a> (OS)
    for tr in soup.fetch('tr'):
        nr = nafn = stafir = existing = flokkur = flokkabbr = None
        for td in tr.fetch('td', {}, False):
            for nobr in td.fetch('nobr', {}, False):
                for a in nobr.fetch('a'):
                    cv_url = unicode(a.get('href', ''))
                    if cv_url.startswith('/altext/cv.php4'):
                        try:
                            nr = cv_url.rsplit('=', 1)[1]
                            nafn = unicode(a.string)
                            stafir = unicode(SKAMMSTOFUN_RE.search(unicode(nobr)).group(1))
                        except:
                            traceback.print_exc()
                # <NOBR><abbr title="Framsoknarflokkur">Framsfl.</abbr> </NOBR>
                for abbr in nobr.fetch('abbr'):
                    flokkur = unicode(abbr.get('title'))
                    flokkabbr = unicode(abbr.string[:-1])  # drop trailing '.'

        if nr and nafn and stafir:
            existing = Thingmadur.objects.filter(stafir=stafir)
            if existing:
                thm = existing[0]
            else:
                thm = Thingmadur()
                print 'Thingmadur: %s (%s)' % (nafn, stafir)
            thm.althingi_id = nr
            thm.nafn = nafn
            thm.stafir = stafir
            thm.url_vefs = urlbase + cv_url[1:]
            thm.url_mynd = MYND_URL % {'nr': nr}
            thm.varamadur = varamenn
            thm.save()

            fl = Flokkur.objects.filter(abbr=flokkabbr)
            if fl:
                fl = fl[0]
            else:
                print 'Flokkur: %s (%s)' % (flokkur, flokkabbr)
                fl = Flokkur(abbr=flokkabbr, nafn=flokkur)
                fl.save()

            thmfl = thm.flokkur()
            if not thmfl or thmfl.abbr != flokkabbr:
                print 'Added %s to %s' % (stafir, flokkabbr)
                Flokksseta(flokkur=fl, thingmadur=thm,
                           upphaf=datetime.datetime.now()).save()

    return True
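# SKAMMSTOFUN_RE is not defined in this fragment. Judging from the sample
# markup in the comment above ("...Ossur ...</a> (OS)"), it captures the MP's
# initials from the parentheses that follow the name link; the exact pattern
# below is an illustrative guess.
import re
SKAMMSTOFUN_RE = re.compile(r'\(([^()<>]+)\)')  # hypothetical pattern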
# Parses a vote (atkvaedagreidsla) page: extracts the topic, timestamp and
# the yes/no/absent/abstain name lists, then rebuilds the Umraeda, Kosning
# and Atkvaedi records for it. Each MP's vote is also compared with the rest
# of his party to flag rebellions (uppreisn).
def parse(self, url, data, fromEncoding=None):
    soup = ScraperParserHTML.parse(self, url, data,
                                   fromEncoding=(fromEncoding or 'windows-1252'))
    efni = soup.h2
    dt = DAGSTIMI_RE.search(soup.title.string)
    dagstimi = dt.group(1)
    urlbase, urldir = self.urls(url)

    # Find the session (lth/fnr) and the bill's ferill.pl URL among the links.
    ferill = url
    lth, fnr = None, None
    for a in soup.fetch('a'):
        href = a.get('href', a.get('HREF', ''))
        if not lth and not fnr:
            l, f = url_to_lth_fnr(href)
            if l and f:
                lth, fnr = l, f
        if ('ferill.pl' in href) and href.startswith('/'):
            ferill = urlbase + href[1:]

    for para in soup.fetch('p'):
        brtt = para.a
        vote = para.dl
        if brtt is None and vote:
            # Walk the parse tree from the <dl>: each <dt> names the answer
            # ("ja:"/"nei:"), the following <dd> lists the voters' initials,
            # and a trailing <p> lists those absent or abstaining.
            elem = vote.next
            svar = None
            ja, nei, fj, sh = [], [], [], []
            while elem:
                name = getattr(elem, 'name', None)
                if name == 'dt':
                    svar = elem.b.string.replace(' ', '').strip()[:-1]
                elif name == 'dd':
                    folk = elem.string.replace('.', '').replace('*', '').strip().split(', ')
                    if svar.startswith('j'):
                        ja = folk
                    elif svar.startswith('n'):
                        nei = folk
                elif name == 'p':
                    satuhja = SATUHJA_RE.search(elem.string or '')
                    fjarstaddir = FJARSTADDIR_RE.search(elem.string or '')
                    if satuhja:
                        sh = satuhja.group(1).replace('.', '').replace('*', '').strip().split(', ')
                    elif fjarstaddir:
                        fj = fjarstaddir.group(1).replace('.', '').replace('*', '').strip().split(', ')
                    if not (elem.string and elem.string.strip()):
                        break
                elem = elem.next

            # Recreate the Umraeda/Kosning/Atkvaedi objects in the DB.
            uid = UMRAEDA_ID_RE.search(url).group(1)
            updating = Umraeda.objects.filter(uid=uid)
            if updating:
                for u in updating:
                    u.delete()

            print 'Umraeda: %s / %s / %s' % (fnr, lth, uid)
            print 'J: %s' % ja
            print 'N: %s' % nei
            print 'F: %s' % fj
            print 'S: %s' % sh

            nu = Umraeda(uid=uid,
                         fundur=Fundur.objects.filter(fnr=fnr, lth=lth)[0],
                         umfang=len(data),
                         timi=dagstimi,
                         efni=efni,
                         url_ferill=ferill,
                         titill=soup.h2.string)
            nu.save()
            nk = Kosning(uid=uid,
                         umraeda=nu,
                         titill=soup.h2.string,
                         timi=dagstimi,  # FIXME: wrong?
                         url_skjal='')
            nk.save()

            thingmenn = {}
            for svar, folk in (('J', ja), ('N', nei), ('F', fj), ('S', sh)):
                for stafir in folk:
                    try:
                        thm = Thingmadur.objects.filter(stafir=stafir)[0]
                        thingmenn[stafir] = {
                            'thm': thm,
                            'fl': thm.flokkur(),  # FIXME: Dags?
                            'svar': svar,
                        }
                    except IndexError:
                        print 'Othekkur thingmadur: %s (%s)' % (stafir, svar)

            for stafir, info in thingmenn.iteritems():
                agree = disagree = 0
                if info['svar'] != 'F':
                    # Count how many party colleagues voted the same way; a
                    # vote against the party majority is marked as a rebellion.
                    for st, nfo in thingmenn.iteritems():
                        if (nfo['svar'] != 'F') and (nfo['fl'] == info['fl']):
                            if nfo['svar'] == info['svar']:
                                agree += 1
                            else:
                                disagree += 1
                Atkvaedi(kosning=nk,
                         thingmadur=info['thm'],
                         uppreisn=(disagree > agree),
                         atkvaedi=info['svar']).save()
                info['thm'].drop_caches()

            nk.sparks(refresh=True)

    return True
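# A worked example of the uppreisn rule above: if three party colleagues vote
# J, J and N, the tally for the N-voter (the loop counts the member himself)
# is agree=1, disagree=2, so uppreisn=True; for each J-voter it is agree=2,
# disagree=1, so uppreisn=False. Absent ('F') votes never count as rebellion.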
# Variant of the vote parser above: the logic is the same, but extracted
# values are coerced to unicode before being stored, and the summary print
# moves to the end.
def parse(self, url, data, fromEncoding=None):
    soup = ScraperParserHTML.parse(self, url, data,
                                   fromEncoding=(fromEncoding or "windows-1252"))
    efni = soup.h2
    dt = DAGSTIMI_RE.search(soup.title.string)
    dagstimi = dt.group(1)
    urlbase, urldir = self.urls(url)

    ferill = url
    lth, fnr = None, None
    for a in soup.fetch("a"):
        href = a.get("href", a.get("HREF", ""))
        if not lth and not fnr:
            l, f = url_to_lth_fnr(href)
            if l and f:
                lth, fnr = l, f
        if ("ferill.pl" in href) and href.startswith("/"):
            ferill = urlbase + href[1:]

    for para in soup.fetch("p"):
        brtt = para.a
        vote = para.dl
        if brtt is None and vote:
            elem = vote.next
            svar = None
            ja, nei, fj, sh = [], [], [], []
            while elem:
                name = getattr(elem, "name", None)
                if name == "dt":
                    svar = elem.b.string.replace(" ", "").strip()[:-1]
                elif name == "dd":
                    folk = elem.string.replace(".", "").replace("*", "").strip().split(", ")
                    if svar.startswith("j"):
                        ja = folk
                    elif svar.startswith("n"):
                        nei = folk
                elif name == "p":
                    satuhja = SATUHJA_RE.search(elem.string or "")
                    fjarstaddir = FJARSTADDIR_RE.search(elem.string or "")
                    if satuhja:
                        sh = satuhja.group(1).replace(".", "").replace("*", "").strip().split(", ")
                    elif fjarstaddir:
                        fj = fjarstaddir.group(1).replace(".", "").replace("*", "").strip().split(", ")
                    if not (elem.string and elem.string.strip()):
                        break
                elem = elem.next

            # Recreate the Umraeda/Kosning/Atkvaedi objects in the DB.
            uid = UMRAEDA_ID_RE.search(unicode(url)).group(1)
            updating = Umraeda.objects.filter(uid=uid)
            if updating:
                for u in updating:
                    u.delete()

            nu = Umraeda(
                uid=uid,
                fundur=Fundur.objects.filter(fnr=fnr, lth=lth)[0],
                umfang=len(data),
                timi=dagstimi,
                efni=unicode(efni),
                url_ferill=unicode(ferill),
                titill=unicode(soup.h2.string),
            )
            nu.save()
            nk = Kosning(
                uid=uid,
                umraeda=nu,
                titill=unicode(soup.h2.string),
                timi=dagstimi,
                url_skjal="",  # FIXME: wrong?
            )
            nk.save()

            thingmenn = {}
            for svar, folk in (("J", ja), ("N", nei), ("F", fj), ("S", sh)):
                for stafir in folk:
                    try:
                        thm = Thingmadur.objects.filter(stafir=stafir)[0]
                        thingmenn[stafir] = {"thm": thm, "fl": thm.flokkur(), "svar": svar}  # FIXME: Dags?
                    except IndexError:
                        print "Othekkur thingmadur: %s (%s)" % (stafir, svar)

            for stafir, info in thingmenn.iteritems():
                agree = disagree = 0
                if info["svar"] != "F":
                    for st, nfo in thingmenn.iteritems():
                        if (nfo["svar"] != "F") and (nfo["fl"] == info["fl"]):
                            if nfo["svar"] == info["svar"]:
                                agree += 1
                            else:
                                disagree += 1
                Atkvaedi(
                    kosning=nk,
                    thingmadur=info["thm"],
                    uppreisn=(disagree > agree),
                    atkvaedi=info["svar"],
                ).save()
                info["thm"].drop_caches()

            nk.sparks(refresh=True)
            print "Umraeda: %s / %s / %s" % (fnr, lth, uid)

    return True
# Simpler variant of the vote parser: no ferill URL, no umfang, and no
# party-rebellion bookkeeping; it just records each MP's vote.
def parse(self, url, data, fromEncoding=None):
    soup = ScraperParserHTML.parse(self, url, data,
                                   fromEncoding=(fromEncoding or 'windows-1252'))
    efni = soup.h2
    dt = DAGSTIMI_RE.search(soup.title.string)
    dagstimi = dt.group(1)

    lth, fnr = None, None
    for a in soup.fetch('a'):
        l, f = url_to_lth_fnr(a.get('href', ''))
        if l and f:
            lth, fnr = l, f
            break

    for para in soup.fetch('p'):
        brtt = para.a
        vote = para.dl
        if brtt is None and vote:
            elem = vote.next
            svar = None
            ja, nei, fj, sh = [], [], [], []
            while elem:
                name = getattr(elem, 'name', None)
                if name == 'dt':
                    svar = elem.b.string.replace(' ', '').strip()[:-1]
                elif name == 'dd':
                    folk = elem.string.replace('.', '').strip().split(', ')
                    if svar.startswith('j'):
                        ja = folk
                    elif svar.startswith('n'):
                        nei = folk
                elif name == 'p':
                    satuhja = SATUHJA_RE.search(elem.string or '')
                    fjarstaddir = FJARSTADDIR_RE.search(elem.string or '')
                    if satuhja:
                        sh = satuhja.group(1).replace('.', '').strip().split(', ')
                    elif fjarstaddir:
                        fj = fjarstaddir.group(1).replace('.', '').strip().split(', ')
                    if not (elem.string and elem.string.strip()):
                        break
                elem = elem.next

            # Recreate the Umraeda/Kosning/Atkvaedi objects in the DB.
            uid = UMRAEDA_ID_RE.search(url).group(1)
            updating = Umraeda.objects.filter(uid=uid)
            if updating:
                for u in updating:
                    u.delete()

            print 'Umraeda: %s / %s / %s' % (fnr, lth, uid)
            print 'J: %s' % ja
            print 'N: %s' % nei
            print 'F: %s' % fj
            print 'S: %s' % sh

            nu = Umraeda(uid=uid,
                         fundur=Fundur.objects.filter(fnr=fnr, lth=lth)[0],
                         timi=dagstimi,
                         efni=efni,
                         #url_ferill=,
                         titill=soup.h2.string)
            nu.save()
            nk = Kosning(umraeda=nu,
                         titill=soup.h2.string,
                         timi=dagstimi,  # FIXME: wrong?
                         url_skjal='')
            nk.save()

            for svar, folk in (('J', ja), ('N', nei), ('F', fj), ('S', sh)):
                for stafir in folk:
                    try:
                        Atkvaedi(kosning=nk,
                                 thingmadur=Thingmadur.objects.filter(stafir=stafir)[0],
                                 atkvaedi=svar).save()
                    except IndexError:
                        print 'Othekkur thingmadur: %s (%s)' % (stafir, svar)

    return True
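# None of the plumbing that drives these parse() methods survives in this
# fragment. Assuming each lives in its own ScraperParserHTML subclass, a
# scrape run presumably looks something like this sketch; the class name
# AtkvaedaParser, its zero-argument constructor and the URL are hypothetical.
if __name__ == '__main__':
    import urllib2
    url = 'http://www.althingi.is/altext/...'  # placeholder vote-page URL
    data = urllib2.urlopen(url).read()
    AtkvaedaParser().parse(url, data)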