def main(): LOAD_ = True start = time() yplib.setUp() yplib.set_debugging(False) if LOAD_: GeoUKRSubject.objects.all().delete() yplib.get('http://en.wikipedia.org/wiki/ISO_3166-2:UA') soup = yplib.soup() tbl = soup.find('table', {'class': "wikitable sortable"}) rows = tbl.findAll('tr') for row in rows: cells = row.findAll('td') if cells: subject = GeoUKRSubject(country_iso='UA', geoname_id=0) cell = cells[1] a = cell.find('a') if a: subject.ascii_name = a.text subject.name = '' fullcode = cells[0].text.split('-') subject.code = fullcode[1] subject.save() elapsed = time() - start print "Elapsed time -->", elapsed
def check_user_profile(geocacher):
    """Fetch the geocaching.su profile page for *geocacher* and copy the
    parsed profile fields onto it.

    Returns True on success, False if the page could not be loaded after
    100 attempts.

    Bug fixed: the failure-log file was opened up front and closed *before*
    the error branch wrote to it, so `fh.write(url)` raised
    `ValueError: I/O operation on closed file`.  The file is now opened
    only on failure and always closed.
    """
    url = 'http://www.geocaching.su/profile.php?pid=%s' % geocacher.pid
    loaded = False
    cnter = 0
    while not loaded and cnter < 100:
        try:
            yplib.get(url)
            loaded = True
        except BrowserStateError:
            cnter += 1
    if not loaded:
        print('cannot go to %s' % url)
        # 'w' truncates, so only the last failing URL is kept -- preserved
        # from the original behaviour.
        fh = open('cant_open_profile.txt', 'w')
        try:
            fh.write(url)
        finally:
            fh.close()
        return False
    soup = yplib.soup()
    tbl = soup.find('table', {'class': 'pages'})
    rows = tbl.findAll('tr')
    # The profile is a two-column table; all data values live in <th> cells.
    all_cells = []
    for row in rows:
        cells = row.findAll('th')
        for cell in cells:
            all_cells.append(cell.text.encode('utf8'))
    user = Cacher()
    user.pid = geocacher.pid
    user.uid = get_uid(tbl)
    # Fixed cell positions; assumes the profile layout never changes --
    # TODO confirm against a current page.
    user.nickname = text_or_none(all_cells[1])
    user.name = text_or_none(all_cells[2])
    user.birstday = strdate_or_none(all_cells[3])
    user.sex = sex_or_none(all_cells[4])
    user.country = text_or_none(all_cells[5])
    user.oblast = text_or_none(all_cells[6])
    user.town = text_or_none(all_cells[7])
    user.phone = text_or_none(all_cells[9])
    user.icq = text_or_none(all_cells[10])
    if user.icq and not user.icq.isdigit():
        user.icq = None  # ICQ numbers are all digits; anything else is junk
    user.web = text_or_none(all_cells[11])
    gps = text_or_none(all_cells[15])
    user.gps = None  # gps[:255] -- intentionally disabled in the original
    user.created_caches = int_or_none(all_cells[18])
    user.found_caches = int_or_none(all_cells[19])
    user.photo_albums = int_or_none(all_cells[21])
    # Trailing cells only exist on full profiles.
    if len(all_cells) > 23:
        user.register_date = date_or_none(all_cells[-3])
        if user.register_date is None:
            user.register_date = date_or_none(all_cells[-2])
        user.last_login = date_or_none(all_cells[-2])
        user.forum_posts = int_or_none(all_cells[-1])
    geocacher.__dict__.update(user.__dict__)
    print('save', geocacher.pid)
    geocacher.save()
    return True
def main(): LOAD_ = True start = time() yplib.setUp() yplib.set_debugging(False) if LOAD_: GeoRUSSubject.objects.all().delete() yplib.get('http://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%B4%D1%8B_%D1%81%D1%83%D0%B1%D1%8A%D0%B5%D0%BA%D1%82%D0%BE%D0%B2_%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE%D0%B9_%D0%A4%D0%B5%D0%B4%D0%B5%D1%80%D0%B0%D1%86%D0%B8%D0%B8') soup=yplib.soup() tbl = soup.find('table', {'class': "sortable standard"}) rows = tbl.findAll('tr') for row in rows: cells = row.findAll('td') print cells if cells: subject = GeoRUSSubject(country_iso='RU', geoname_id=0) cell = cells[0] a = cell.find('a') if a: subject.name = a.text subject.ascii_name = cells[1].text subject.code = cells[2].text subject.gai_code = cells[3].text subject.iso_3166_2_code = cells[4].text subject.save() elapsed = time() - start print "Elapsed time -->", elapsed
def add_geocacher(pid): url = 'http://www.geocaching.su/profile.php?pid=%s' % pid print url try: yplib.get(url) except BrowserStateError: pass try: soup = yplib.soup() except UnicodeDecodeError: print 'exception, pid=%s' % pid return tbl = soup.find('table', {'class': 'pages'}) rows = tbl.findAll('tr') all_cells = [] for row in rows: cells = row.findAll('th') for cell in cells: all_cells.append(cell.text.encode('utf8')) user = Cacher() user.pid = pid user.uid = get_uid(tbl) user.nickname = text_or_none(all_cells[1]) if user.nickname: print user.nickname if user.nickname: user.nickname = user.nickname[:64] user.name = text_or_none(all_cells[2]) user.birstday = strdate_or_none(all_cells[3]) user.sex = sex_or_none(all_cells[4]) user.country = text_or_none(all_cells[5]) user.oblast = text_or_none(all_cells[6]) user.town = text_or_none(all_cells[7]) user.phone = text_or_none(all_cells[9]) user.icq = text_or_none(all_cells[10]) if user.icq and not user.icq.isdigit(): user.icq = None user.web = text_or_none(all_cells[11]) if user.web: user.web = user.web[:128] user.gps = None #gps[:255].encode if gps else None user.created_caches = int_or_none(all_cells[18]) user.found_caches = int_or_none(all_cells[19]) user.photo_albums = int_or_none(all_cells[21]) if len(all_cells) > 23: user.register_date = date_or_none(all_cells[-3]) if user.register_date is None: user.register_date = date_or_none(all_cells[-2]) user.last_login = date_or_none(all_cells[-2]) user.forum_posts = int_or_none(all_cells[-1]) geocacher = Geocacher.objects.create(pid=pid) geocacher.__dict__.update(user.__dict__) print 'save', geocacher.pid if user.web: print user.web geocacher.save() return True
def check_cach(cach_pid):
    """Scrape the geocaching.su cache page for *cach_pid*, parse its
    attributes into a TheCach value object, and create/update the
    corresponding Cach row.  Returns True on success, False if the page
    could not be fetched.
    """

    def get_coordinates(cell):
        # Parse "N dd&#176; mm.mmm E dd&#176; mm.mmm" style text into
        # degree/minute components plus the N/S and E/W hemisphere strings.
        coordinates = cell.text
        parts = t2.findall(coordinates)[0]
        if len(parts) == 4:
            ns_degree, ns_minute, ew_degree, ew_minute = parts
        parts = t3.findall(coordinates)
        NS = parts[0]
        parts = t4.findall(coordinates)
        EW = parts[0]
        return ns_degree, ns_minute, ew_degree, ew_minute, NS, EW

    def get_type(cell):
        # Cache type is the cell's plain text.
        return cell.text

    def get_class(cell):
        # Join all non-tag child strings with ';' (multi-valued class field).
        class_ = None
        if cell:
            parts = cell.contents
            items = []
            for p in parts:
                txt = p.string
                if txt and nottag(txt):
                    items.append(txt)
            class_ = ';'.join(items)
        return class_

    def get_mestnost(cell):
        # Country is the first child node, oblast (if any) the third.
        oblast = country = None
        parts = cell.contents
        if len(parts):
            country = parts[0]
        if len(parts) > 2:
            oblast = parts[2]
        return country, oblast

    def get_dostupnost(cell):
        # Both ratings are "label: value" fragments; keep the value part.
        parts = cell.contents
        dostupnost = parts[0].split(':')[1].strip()
        mestnost = parts[2].split(':')[1].strip()
        return dostupnost, mestnost

    def get_town(cell):
        return cell.text

    def get_grade(cell):
        # The rating is encoded in the image's title attribute.
        grade = None
        if cell.img:
            grade = cell.img.get('title')
        return grade

    def get_attributes(element):
        # Attribute icons live under images/attrib/; titles joined with ';'.
        attr = None
        items = []
        imgs = element.findAll('img')
        for img in imgs:
            if 'images/attrib/' in img.get('src'):
                items.append(img.get('title'))
        attr = ';'.join(items)
        return attr

    url = 'http://www.geocaching.su/?pn=101&cid=%d' % int(cach_pid)
    try:
        yplib.get(url)
    except:
        print 'exception'
        return False
    soup = yplib.soup()
    h = soup.find('h1', {'class': 'hdr'})
    # Header looks like "Name [TY1234/...]": t extracts the name, t1 the code.
    t = re.compile('([^\[]+)\[.+\]')
    t1 = re.compile('[^\[]+\[([^\[\]]+\/[^\[\]]+)\]')
    # t2..t4 parse the HTML-escaped degree sign (&#176;) coordinate strings.
    t2 = re.compile('[N,S]\s(\d+)\&\#176\;\s([\d\.]+).+[E,W]\s(\d+)\&\#176\;\s([\d\.]+)')
    t3 = re.compile('([N,S]\s\d+\&\#176\;\s[\d\.]+.)')
    t4 = re.compile('([E,W]\s\d+\&\#176\;\s[\d\.]+.)')
    # t5 pulls the author pid out of the WinPopup onclick handler.
    t5 = re.compile('WinPopup\(\'profile\.php\?pid\=(\d+)')
    name = None
    items = t.findall(h.text)
    if items:
        name = items[0]
    full_code = None
    items = t1.findall(h.text)
    if items:
        full_code = items[0]
    type_code, pid = full_code.split('/')
    tbl = soup.find('table', attrs={'cellpadding': 3, 'width': 160})
    rows = tbl.findAll('tr')
    ns_degree = ns_minute = ew_degree = ew_minute = NS = EW = None
    country = oblast = town = None
    dostupnost = mestnost = None
    cach_type = cach_class = None
    grade = attr = None
    # The sidebar table alternates label rows and value rows.  `act`
    # remembers which label was seen so the *next* row is parsed accordingly.
    act = None
    for row in rows:
        tds = row.findAll('td')
        ths = row.findAll('th')
        td = None
        if tds:
            td = tds[0]
        cell = None
        if act:
            if ths:
                cell = ths[0]
            elif tds:
                cell = tds[1]
            if act == 'coord':
                ns_degree, ns_minute, ew_degree, ew_minute, NS, EW = get_coordinates(cell)
            if act == 'mestnost':
                country, oblast = get_mestnost(cell)
            if act == 'dostupnost':
                dostupnost, mestnost = get_dostupnost(cell)
            if act == 'town':
                town = get_town(cell)
            if act == 'grade':
                grade = get_grade(cell)
            act = None
        # Label detection (Russian captions on the cache page).
        if td and td.text.startswith(u'Тип:'):
            cach_type = get_type(tds[1])
            act = None
        if td and td.text.startswith(u'Класс:'):
            cach_class = get_class(tds[1])
            act = None
        if td and td.text.startswith(u'КООРДИНАТЫ'):
            act = 'coord'
        if td and td.text.startswith(u'МЕСТНОСТЬ'):
            act = 'mestnost'
        if td and td.text.startswith(u'БЛИЖАЙШИЙ'):
            act = 'town'
        if td and td.text.startswith(u'ОЦЕНКИ'):
            act = 'dostupnost'
        if td and td.text.startswith(u'РЕЙТИНГ'):
            act = 'grade'
        if td and td.text.startswith(u'АТРИБУТЫ'):
            attr = get_attributes(tbl)
            act = None
    # Creation metadata lives in a styled <div> below the table.
    created_by = created_date = changed_date = coauthors = None
    div = soup.findAll('div', attrs={'style': 'padding: 5px; font-family: Verdana; font-weight: bold;'})[0]
    a = div.a
    if a:
        onclick = a.get('onclick')
        if onclick:
            pid = t5.findall(onclick)
            if pid:
                created_by = int(pid[0])
    parts = div.contents
    for p in parts:
        txt = p.string
        #if txt:
            #print txt.encode('utf8'), type(txt)
        if txt and nottag(txt):
            txt = txt.string.strip()
            if txt.startswith(u'Создан:'):
                # "Создан: dd.mm.yyyy" -> creation date
                items = txt.split()
                if len(items) == 2:
                    created_date = items[1]
                if created_date:
                    day, month, year = [int(s) for s in created_date.split('.')]
                    created_date = date(year, month, day)
            if txt.startswith(u'(отредактирован'):
                # "(отредактирован dd.mm.yyyy)" -> last-edit date; strip parens.
                txt = txt[1:-1]
                items = txt.split()
                if len(items) == 2:
                    changed_date = items[1]
                if changed_date:
                    day, month, year = [int(s) for s in changed_date.split('.')]
                    changed_date = date(year, month, day)
            if txt.startswith(u'Компаньоны:'):
                coauthors = 'yes'
    # Assemble the value object and persist it.
    the_cach = TheCach()
    the_cach.pid = cach_pid
    the_cach.code = '%s%s' % (type_code, the_cach.pid)
    the_cach.type_code = type_code
    #print
    #print cach.pid
    #print '|%s|'%the_cach.code.encode('utf8')
    the_cach.name = text_or_none(name)
    the_cach.cach_type = text_or_none(cach_type)
    the_cach.cach_class = text_or_none(cach_class)
    the_cach.loc_NS = char_or_none(NS)
    the_cach.loc_EW = char_or_none(EW)
    the_cach.loc_NS_degree = int_or_none(ns_degree)
    the_cach.loc_EW_degree = int_or_none(ew_degree)
    the_cach.loc_NS_minute = float_or_none(ns_minute)
    the_cach.loc_EW_minute = float_or_none(ew_minute)
    the_cach.country = text_or_none(country)
    the_cach.oblast = text_or_none(oblast)
    the_cach.town = text_or_none(town)
    the_cach.dostupnost = int_or_none(dostupnost)
    the_cach.mestnost = int_or_none(mestnost)
    the_cach.grade = float_or_none(grade)
    the_cach.cach_attr = text_or_none(attr)
    the_cach.created_by = created_by
    the_cach.created_date = created_date
    the_cach.changed_date = changed_date
    the_cach.coauthors = coauthors
    print the_cach.name.encode('utf8')
    geocache = get_object_or_none(Cach, pid=cach_pid)
    if geocache is not None:
        update_geocache(geocache, the_cach)
    else:
        cach = Cach.objects.create(pid=cach_pid)
        cach.__dict__.update(the_cach.__dict__)
        print 'save', cach.pid
        cach.save()
    #nc += 1
    #if True:
        #cach.__dict__.update(the_cach.__dict__)
        #print 'save', cach.pid
        #cach.save()
    return True
def geocacher_format_insert_string(pid):
    """Scrape the geocaching.su profile for *pid* and return a SQL VALUES
    tuple string "(v1,v2,...)" for a bulk geocacher INSERT (column order:
    pid, uid, nickname, name, birstday, sex, country, oblast, town, phone,
    created_caches, found_caches, photo_albums, register_date, last_login,
    forum_posts).  Returns False if the profile page never loads.

    Bug fixed: on the failure path the function returned before closing the
    log file handle (and the handle was opened even when never needed).
    The file is now opened only on failure and always closed.
    """
    # try open profile
    fields = str(pid)
    url = 'http://www.geocaching.su/profile.php?pid={}'.format(pid)
    loaded = False
    cnter = 0
    while not loaded and cnter < 100:
        try:
            yplib.get(url)
            loaded = True
        except BrowserStateError:
            cnter += 1
    if not loaded:
        print('cannot go to %s' % url)
        # 'w' truncates, so only the last failing URL is kept (original behaviour).
        fh = open('cant_open_profile.txt', 'w')
        try:
            fh.write(url)
        finally:
            fh.close()
        return False
    # processing profile
    soup = yplib.soup()
    tbl = soup.find('table', {'class': 'pages'})
    rows = tbl.findAll('tr')
    all_cells = []
    theuser = {}
    for row in rows:
        # Positional extraction (legacy) ...
        cells = row.findAll('th')
        for cell in cells:
            all_cells.append(cell.text.encode('utf8'))
        # ... plus caption-based extraction: one <td> label, <th> value(s).
        title_cells = row.findAll('td')
        data_cells = row.findAll('th')
        if len(title_cells) == 1:
            title_cell = title_cells[0]
            title = title_cell.text
            data = ''
            if len(data_cells):
                data_cell = data_cells[-1]
                data = data_cell.text
            if title.startswith(u'Псевдоним:'):
                theuser['nickname'] = data
                continue
            if title.startswith(u'Страна:'):
                theuser['country'] = data
                continue
            if title.startswith(u'Область:'):
                theuser['oblast'] = data
                continue
            if title.startswith(u'Нас.пункт'):
                theuser['town'] = data
                continue
            if title.startswith(u'Создал тайников:'):
                theuser['created'] = data
                continue
            if title.startswith(u'Нашел тайников:'):
                theuser['found'] = data
                continue
            if title.startswith(u'Рекомендовал тайников:'):
                theuser['recommended'] = data
                continue
            if title.startswith(u'Фотоальбомы:'):
                theuser['photo_albums'] = data
                continue
            if title.startswith(u'Был на сайте'):
                theuser['last_visited'] = data
                continue
            if title.startswith(u'Дата регистрации:'):
                theuser['registered'] = data
                continue
            if title.startswith(u'Сообщений в форумах:'):
                theuser['forum_posts'] = data
    uid = get_uid(tbl)
    fields += ',{}'.format(int_field(uid))  # uid
    fields += ',{}'.format(text_field(theuser.get('nickname') or ''))  # nickname
    fields += ',{}'.format(text_field(all_cells[2]))  # name
    fields += ',{}'.format(date_field(all_cells[3]))  # birstday
    fields += ',{}'.format(sex_field(all_cells[4]))  # sex
    fields += ',{}'.format(text_field(theuser.get('country') or ''))  # country
    fields += ',{}'.format(text_field(theuser.get('oblast') or ''))  # oblast
    fields += ',{}'.format(text_field(theuser.get('town') or ''))  # town
    fields += ',{}'.format(text_field(all_cells[9]))  # phone
    fields += ',{}'.format(int_field(theuser.get('created') or 0))  # created_caches
    fields += ',{}'.format(int_field(theuser.get('found') or 0))  # found_caches
    fields += ',{}'.format(int_field(theuser.get('photo_albums') or 0))  # photo_albums
    fields += ',{}'.format(date_field(theuser.get('registered') or ''))  # register_date
    fields += ',{}'.format(date_field(theuser.get('last_visited') or ''))  # last_login
    fields += ',{}'.format(int_field(theuser.get('forum_posts') or 0))  # forum_posts
    # Escape '%' so the string survives a later %-style SQL interpolation.
    return "({})".format(fields).replace('%', '%%')
def main(processed_pid):
    """Geo-locate caches that lack a country/subject code by querying the
    GeoNames countrySubdivision service, then normalise leftovers in SQL
    and log the remaining undefined counts.

    `processed_pid` is currently unused (superseded by the SQL filter below)
    -- NOTE(review): kept for caller compatibility.
    """
    #if not switch_off_status_updated():
        #return False
    LOAD_GEO_LOCATION = True
    start = time()
    if LOAD_GEO_LOCATION:
        #.filter(pid=5408)
        #for cach in Cach.objects.all().filter(pid__gt=processed_pid).order_by('pid')[:1990]:
        # Only caches still missing a country or subject code, 1000 per run.
        for cach in Cach.objects.all().extra(where=["country_code IS NULL OR admin_code IS NULL OR admin_code='777'"]).order_by('pid')[:1000]:
            lat = cach.latitude_degree
            lng = cach.longitude_degree
            if lat is not None and lng is not None:
                # Probe the exact point first, then four nearby offsets --
                # helps when the point falls just outside a polygon border.
                d = ((0, 0), (0.01, 0), (-0.01, 0), (0, 0.01), (0, -0.01))
                cnt = 0
                while cnt < 5:
                    url = 'http://api.geonames.org/countrySubdivision?username=galdor&lat=%s&lng=%s&lang=en' % (lat + d[cnt][0], lng + d[cnt][1])
                    print
                    print cach.pid, url
                    yplib.get(url)
                    try:
                        soup = yplib.soup()
                    except:
                        # One blind retry of the same request before giving up.
                        url = 'http://api.geonames.org/countrySubdivision?username=galdor&lat=%s&lng=%s&lang=en' % (lat + d[cnt][0], lng + d[cnt][1])
                        yplib.get(url)
                        try:
                            soup = yplib.soup()
                        except:
                            soup = None
                    if soup:
                        item = soup.find('countrysubdivision')
                        if item:
                            break
                    cnt += 1
                if soup is None:
                    print cach.pid, lat, lng, cach.loc_NS, cach.loc_NS_degree, cach.loc_NS_minute, cach.loc_EW, cach.loc_EW_degree, cach.loc_EW_minute
                    continue
                item = soup.find('countrycode')
                if item and item.text:
                    cach.country_code = item.text.encode('utf8')
                if soup.admincode1 and soup.admincode1.text:
                    cach.admin_code = soup.admincode1.text
                item = soup.find('code', {'type': 'FIPS10-4'})
                if item:
                    cach.code_fips10_4 = item.text
                item = soup.find('code', {'type': 'ISO3166-2'})
                if item:
                    cach.code_iso3166_2 = item.text
                item = soup.find('countryname')
                if item:
                    cach.country_name = item.text.encode('cp1251')
                if soup.adminname1:
                    cach.oblast_name = soup.adminname1.text.encode('cp1251')
                #print cach.pid, cach.country_name, cach.country_code, cach.oblast_name
                #print soup
                #print
                #print cach.pid
                # Only persist when we got a plausible 2-letter ISO code.
                if cach.country_code and len(cach.country_code) == 2:
                    cach.save()
                else:
                    print cach.pid, lat, lng, cach.loc_NS, cach.loc_NS_degree, cach.loc_NS_minute, cach.loc_EW, cach.loc_EW_degree, cach.loc_EW_minute
    count_without_country = Cach.objects.filter(country_code__isnull=True).count()
    count_without_subject = Cach.objects.filter(admin_code__isnull=True).count()
    print '%s have no country' % count_without_country
    print '%s have no country subject' % count_without_subject
    # '777' is the sentinel for "country known, subject undetermined".
    sql = "UPDATE cach SET admin_code='777', oblast_name='undefined subject' WHERE country_code IS NOT NULL AND admin_code IS NULL"
    r = execute_query(sql)
    sql = """SELECT COUNT(*) FROM cach WHERE country_code IS NULL"""
    undefined_country_count = sql2val(sql)
    sql = """SELECT COUNT(*) FROM cach WHERE admin_code IS NULL OR admin_code = '777'"""
    undefined_subject_count = sql2val(sql)
    undefined_count = '%s/%s' % (undefined_country_count, undefined_subject_count)
    elapsed = time() - start
    print "Elapsed time -->", elapsed
    #switch_on_status_updated()
    log('gcsu_location', 'OK %s' % undefined_count)
def main():
    """Log in to geocaching.su, fetch the region list and the WPT export of
    all caches; the (currently unreachable) tail would refresh the Cach table
    page-by-page and/or geolocate caches via the old ws.geonames.org service.
    """
    #if not switch_off_status_updated():
        #return False
    LOAD_CACHES = True
    LOAD_GEO_LOCATION = False
    start = time()
    yplib.setUp()
    yplib.set_debugging(False)
    # Authenticate; success is detected by the presence of our profile link.
    r = yplib.post2('http://www.geocaching.su/?pn=108', (('Log_In', 'Log_In'), ('email', '*****@*****.**'), ('passwd', 'zaebalixakeryvas'), ('longterm', '1')))
    soup = yplib.soup()
    a = soup.find('a', attrs={'class': "profilelink"}, text='galdor')
    if not a:
        print 'Authorization failed'
        return False
    print 'OK'
    if LOAD_CACHES:
        # Collect all region checkbox values from the selection popup.
        r = yplib.get('http://www.geocaching.su/site/popup/selex.php')
        soup = yplib.soup()
        #print soup
        #html = yplib.show()
        chbox_list = soup.findAll('input', type='checkbox')
        regions = []
        #print chbox_list
        print
        for chbox in chbox_list:
            #print chbox.get('value')
            v = chbox.get('value')
            if v and chbox.get('name', '') == 'point[]':
                regions.append(v)
        print
        print regions
        # Request the WPT export for every region.
        data = [
            ('translit', '0'),
            ('fmt', 'wpt'),
            ('code_to_name', '1'),
            ('finded', '2'),
        ]
        for r in regions:
            data.append(('point[]', r))
        print
        print data
        print
        r = yplib.post2('http://www.geocaching.su/site/popup/export.php', data)
        soup = yplib.soup()
        txt = soup.text
        print txt
        # NOTE(review): this early return makes everything below unreachable,
        # including the Cach.objects.all().delete() refresh -- apparently a
        # debugging leftover; left untouched because removing it would
        # re-enable a destructive full-table reload.
        return
        Cach.objects.all().delete()
        cntr_list = []
        t = re.compile('\<td\>(\w\w\d+)\<\/td\>')
        # Walk the paginated cache list until a page repeats (= end reached).
        for p in range(100):
            item_list = []
            r = yplib.post2('http://www.geocaching.su/?pn=101', (('sort', '1'), ('page', str(p)), ('in_page', '100'), ('finded', '1'), ('y', '0'), ('x', '0'), ('updown', '1')))
            html = yplib.show()
            code_list = t.findall(html)
            for code in code_list:
                pid = code[2:]
                item_list.append({'id': pid, 'code': code})
            if item_list == cntr_list:
                break
            else:
                cntr_list = item_list
                check_cach_list(item_list)
            #check_cach_list([{'id': 2746, 'code': 'EX2746'}])
            #break
    if LOAD_GEO_LOCATION:
        #.filter(pid=5408)
        for cach in Cach.objects.all():
            lat = cach.latitude_degree
            lng = cach.longitude_degree
            if lat is not None and lng is not None:
                url = 'http://ws.geonames.org/countrySubdivision?lat=%s&lng=%s&lang=ru' % (lat, lng)
                print
                print cach.pid, url
                yplib.get(url)
                try:
                    soup = yplib.soup()
                except:
                    # Retry once with English labels if the ru response breaks.
                    url = 'http://ws.geonames.org/countrySubdivision?lat=%s&lng=%s&lang=en' % (lat, lng)
                    yplib.get(url)
                    soup = yplib.soup()
                item = soup.find('countrycode')
                if item:
                    cach.country_code = item.text.encode('utf8')
                if soup.admincode1:
                    cach.admin_code = soup.admincode1.text
                item = soup.find('code', {'type': 'FIPS10-4'})
                if item:
                    cach.code_fips10_4 = item.text
                item = soup.find('code', {'type': 'ISO3166-2'})
                if item:
                    cach.code_iso3166_2 = item.text
                item = soup.find('countryname')
                if item:
                    cach.country_name = item.text.encode('cp1251')
                if soup.adminname1:
                    cach.oblast_name = soup.adminname1.text.encode('cp1251')
                print cach.pid, cach.country_name, cach.oblast_name
                #print soup
                #print
                #print cach.pid
                cach.save()
            else:
                print cach.pid, lat, lng, cach.loc_NS, cach.loc_NS_degree, cach.loc_NS_minute, cach.loc_EW, cach.loc_EW_degree, cach.loc_EW_minute
    switch_on_status_updated()
    log('gcsu_caches', 'OK')
    elapsed = time() - start
    print "Elapsed time -->", elapsed
def main():
    """Refresh the per-geocacher activity logs (created / found / recommended
    caches, photo albums) by scraping userstat.php.  Each section is gated by
    its own LOAD_* flag; all four are currently False, so this run is a no-op
    apart from login and timing.
    """
    if not switch_off_status_updated():
        return False
    LOAD_CREATED_CACHE_LOGS = False
    LOAD_SEEK_CACHE_LOGS = False
    LOAD_RECOMMEND_CACHE_LOGS = False
    LOAD_PHOTOALBUM_LOGS = False
    start = time()
    yplib.setUp()
    yplib.set_debugging(False)
    # Authenticate; success is detected by the presence of our profile link.
    r = yplib.post2('http://www.geocaching.su/?pn=108', (('Log_In', 'Log_In'), ('email', '*****@*****.**'), ('passwd', 'zaebalixakeryvas'), ('longterm', '1')))
    soup = yplib.soup()
    a = soup.find('a', attrs={'class': "profilelink"}, text='galdor')
    if not a:
        print 'Authorization failed'
        return False
    print
    print 'BEGIN'
    if LOAD_CREATED_CACHE_LOGS:
        # Full rebuild of the "created caches" log.
        LogCreateCach.objects.all().delete()
        print 'delete create logs'
        cachers = Geocacher.objects.all()
        print cachers.count()
        # t extracts the cache pid from a link href; t1 the creation date.
        t = re.compile('\?pn\=101\&cid=(\d+)')
        t1 = re.compile(u'создан\s+(\d\d\.\d\d\.\d\d\d\d)')
        for cacher in cachers:
            if cacher.uid:
                print cacher.pid, cacher.uid
                url = 'http://www.geocaching.su/site/popup/userstat.php?s=1&uid=%s' % cacher.uid
                try:
                    yplib.get(url)
                except BrowserStateError:
                    continue
                soup = yplib.soup()
                tbl = soup.find('table', attrs={'class': 'pages'})
                if tbl:
                    #print tbl
                    rows = tbl.findAll('tr')
                    #print len(rows)
                    for row in rows:
                        cach_pid = created_date = None
                        coauthor = False
                        cell = row.find('td')
                        if cell:
                            #print cell
                            a_list = cell.findAll('a')
                            for a in a_list:
                                cach_pid = None
                                parts = t.findall(a['href'])
                                if len(parts):
                                    cach_pid = int(parts[0])
                                txt = cell.text
                                print cacher.pid, cach_pid, txt.encode('utf8')
                                if u'(соавтор)' in txt:
                                    coauthor = True
                                found = t1.findall(txt)
                                if found:
                                    created_date = found[0]
                                    created_date = date_or_none(created_date)
                                if cach_pid:
                                    the_log = LogCreateCach(author_pid=cacher.pid, cach_pid=cach_pid)
                                    the_log.created_date = created_date
                                    the_log.coauthor = coauthor
                                    the_log.save()
        print 'saved'
    if LOAD_SEEK_CACHE_LOGS:
        # Full rebuild of the "found caches" log.
        LogSeekCach.objects.all().delete()
        cachers = Geocacher.objects.all()
        t = re.compile('\?pn\=101\&cid=(\d+)')
        t1 = re.compile(u'создан\s+(\d\d\.\d\d\.\d\d\d\d)')
        t2 = re.compile(u'найден\s+(\d\d\.\d\d\.\d\d\d\d)')
        t3 = re.compile(u'оценен\s+на\s+(\d)')
        fh = open('cant_open_userstat.txt', 'w')
        for cacher in cachers:
            if cacher.uid:
                print cacher.pid, cacher.uid
                url = 'http://www.geocaching.su/site/popup/userstat.php?s=2&uid=%s' % cacher.uid
                loaded = False
                cnter = 0
                while not loaded and cnter < 100:
                    try:
                        yplib.get(url)
                        soup = yplib.soup()
                        loaded = True
                    except BrowserStateError:
                        cnter += 1
                if not loaded:
                    print 'cannot go to %s' % url
                    fh.write(url)
                # NOTE(review): if the page never loaded, `soup` below is stale
                # (previous cacher's page) or unbound on the first iteration.
                tbl = soup.find('table', attrs={'class': 'pages'})
                if tbl:
                    rows = tbl.findAll('tr')
                    for row in rows:
                        cach_pid = found_date = grade = None
                        cell = row.find('td')
                        if cell:
                            a_list = cell.findAll('a')
                            for a in a_list:
                                cach_pid = None
                                parts = t.findall(a['href'])
                                if len(parts):
                                    cach_pid = int(parts[0])
                                txt = cell.text
                                found = t3.findall(txt)
                                if found:
                                    g = found[0]
                                    grade = int_or_none(g)
                                print cacher.pid, cach_pid, txt.encode('utf8')
                                found = t2.findall(txt)
                                if found:
                                    found_date = found[0]
                                    found_date = date_or_none(found_date)
                                if cach_pid:
                                    the_log = LogSeekCach(cacher_pid=cacher.pid, cach_pid=cach_pid)
                                    the_log.found_date = found_date
                                    the_log.grade = grade
                                    the_log.save()
        print 'saved'
        fh.close()
    if LOAD_RECOMMEND_CACHE_LOGS:
        # Full rebuild of the "recommended caches" log.
        LogRecommendCach.objects.all().delete()
        cachers = Geocacher.objects.all()
        t = re.compile('\?pn\=101\&cid=(\d+)')
        for cacher in cachers:
            if cacher.uid:
                print cacher.pid, cacher.uid
                url = 'http://www.geocaching.su/site/popup/userstat.php?s=3&uid=%s' % cacher.uid
                yplib.get(url)
                soup = yplib.soup()
                tbl = soup.find('table', attrs={'class': 'pages'})
                if tbl:
                    rows = tbl.findAll('tr')
                    for row in rows:
                        cach_pid = found_date = grade = None
                        cell = row.find('td')
                        if cell:
                            a_list = cell.findAll('a')
                            for a in a_list:
                                cach_pid = None
                                parts = t.findall(a['href'])
                                if len(parts):
                                    cach_pid = int(parts[0])
                                txt = cell.text
                                print cacher.pid, cach_pid, txt.encode('utf8')
                                if cach_pid:
                                    the_log = LogRecommendCach(cacher_pid=cacher.pid, cach_pid=cach_pid)
                                    the_log.save()
        print 'saved'
    if LOAD_PHOTOALBUM_LOGS:
        # Full rebuild of the photo-album log.
        LogPhotoAlbum.objects.all().delete()
        cachers = Geocacher.objects.all()
        t = re.compile('showmemphotos\.php\?cid=(\d+)')
        for cacher in cachers:
            if cacher.uid:
                print cacher.pid, cacher.uid
                url = 'http://www.geocaching.su/site/popup/userstat.php?s=4&uid=%s' % cacher.uid
                yplib.get(url)
                soup = yplib.soup()
                tbl = soup.find('table', attrs={'class': 'pages'})
                if tbl:
                    rows = tbl.findAll('tr')
                    for row in rows:
                        cach_pid = found_date = grade = None
                        cell = row.find('td')
                        if cell:
                            a_list = cell.findAll('a')
                            for a in a_list:
                                cach_pid = None
                                parts = t.findall(a['href'])
                                if len(parts):
                                    cach_pid = int(parts[0])
                                txt = cell.text
                                print cacher.pid, cach_pid, txt.encode('utf8')
                                if cach_pid:
                                    the_log = LogPhotoAlbum(cacher_pid=cacher.pid, cach_pid=cach_pid)
                                    the_log.save()
        print 'saved'
    elapsed = time() - start
    print "Elapsed time -->", elapsed
    switch_on_status_updated()
    log('gcsu_logs', 'OK')
def main(processed_pid): LOAD_GEO_LOCATION = True LOAD_GOOGLE_LOCATION = True start = time() if LOAD_GEO_LOCATION: for thing in Geothing.objects.all().extra(where=["country_code IS NULL OR admin_code IS NULL OR admin_code='777'"]).order_by('pid')[:100]: lat = thing.latitude_degree lng = thing.longitude_degree if lat is not None and lng is not None: cnt = 1 r = 10 admin_code = None while cnt < 2: url = 'http://api.geonames.org/countrySubdivision?username=galdor&lat=%s&lng=%s&lang=en&radius=%d' % (lat, lng, r*cnt) yplib.get(url) try: soup=yplib.soup() except: pass if soup: item = soup.find('countrysubdivision') if item: if soup.admincode1: admin_code = soup.admincode1.text if admin_code: break cnt += 1 item = soup.find('countrycode') if item and item.text: thing.country_code = item.text.encode('utf8') if soup.admincode1: thing.admin_code = soup.admincode1.text item = soup.find('countryname') if item: thing.country_name = item.text if soup.adminname1: thing.oblast_name = soup.adminname1.text if thing.country_code and len(thing.country_code)==2: thing.save() else: print 'no location', thing.pid, lat, lng, thing.location.NS, thing.location.NS_degree, thing.location.NS_minute, thing.location.EW, thing.location.EW_degree, thing.loc_EW_minute if LOAD_GOOGLE_LOCATION: for thing in Geothing.objects.all().extra(where=["country_code IS NULL OR country_name IS NULL OR admin_code IS NULL OR admin_code='777'"]).order_by('pid')[:100]: lat = thing.latitude_degree lng = thing.longitude_degree if lat is not None and lng is not None: admin_name = None country_code = None country_name = None admin_code = None url = 'http://maps.googleapis.com/maps/api/geocode/json?latlng=%s,%s&sensor=false' % (lat, lng) f = urllib2.urlopen(url) data = f.read() try: r = json.loads(data) except Exception as e: print type(e) print e if r.get('status') == 'OK' and len(r.get('results')): for result in r.get('results'): if len(result.get('address_components')): for address in 
result.get('address_components'): types = address.get("types") if "country" in types and "political" in types: country_code = address.get("short_name") if "administrative_area_level_1" in types and "political" in types: admin_name = address.get("short_name") if len(admin_name) < 6: admin_name = address.get("long_name") if country_code: thing.country_code = country_code thing.oblast = admin_name thing.admin_code = get_admin_code_by_name(country_code, admin_name) thing.save() else: print lat, lng, country_code, country_name, admin_name else: print thing.pid, lat, lng, thing.location.NS, thing.location.NS_degree, thing.location.NS_minute, thing.location.EW, thing.location.EW_degree, thing.loc_EW_minute sql = """ UPDATE geothing gt LEFT JOIN oblast_subject os ON ( gt.country_code=os.country_iso and gt.oblast=os.oblast ) SET gt.admin_code=os.code WHERE os.id IS NOT NULL """ r = execute_query(sql) sql = """ UPDATE geothing SET admin_code='777', oblast_name='undefined subject' WHERE country_code IS NOT NULL AND admin_code IS NULL """ r = execute_query(sql) sql = """ update geothing gt left join geo_country c on gt.country_code=c.iso set gt.country_name=c.name """ r = execute_query(sql) sql = """ update geothing gt left join geo_country_subject c on gt.admin_code=c.code and gt.country_code=c.country_iso set gt.oblast_name=c.name where gt.admin_code='777' """ r = execute_query(sql) sql = """ update geothing set country_code='RU', admin_code='82', country = 'Россия', oblast = 'Республика Крым', country_name = 'Russia', oblast_name = 'Respublika Krym' where country_code='UA' and admin_code='11' """ r = execute_query(sql) sql = """SELECT COUNT(*) FROM geothing WHERE country_code IS NULL""" undefined_country_count = sql2val(sql) sql = """SELECT COUNT(*) FROM geothing WHERE admin_code IS NULL OR admin_code = '777'""" undefined_subject_count = sql2val(sql) undefined_count = '%s/%s' % (undefined_country_count, undefined_subject_count) log('map_set_location', 'OK 
%s'%undefined_count) elapsed = time() - start print "Elapsed time -->", elapsed
def main(): LOAD_CACHES = True start = time() yplib.setUp() yplib.set_debugging(False) r = yplib.post2('http://www.geocaching.su/?pn=108', (('Log_In', 'Log_In'), ('email', '*****@*****.**'), ('passwd', 'zaebalixakeryvas'), ('longterm', '1'))) soup = yplib.soup() a = soup.find('a', attrs={'class': "profilelink"}, text='galdor') if not a: print 'Authorization failed' return False r = yplib.get('http://www.geocaching.su/site/popup/selex.php') soup = yplib.soup() chbox_list = soup.findAll('input', type='checkbox') regions = [] for chbox in chbox_list: v = chbox.get('value') if v and chbox.get('name', '') == 'point[]': regions.append(v) data = [ ('translit', '0'), ('fmt', 'wpt'), ('code_to_name', '1'), ('finded', '2'), ] for r in regions: data.append(('point[]', r)) r = yplib.post2('http://www.geocaching.su/site/popup/export.php', data) soup = yplib.soup() wpt = soup.text.split('\n') WPT_CODE = 1 WPT_LAT = 2 WPT_LON = 3 WPT_TITLE = 10 WPT_DATE = 4 geosite = Geosite.objects.get(code='GC_SU') print len(wpt), 'points' k = 0 for point in wpt: k += 1 fields = point.split(',') if fields[0].isdigit(): the_geothing = TheGeothing() the_location = TheLocation() lat_degree = float(fields[WPT_LAT]) the_location.NS_degree = lat_degree #the_location.NS_minute = (abs(lat_degree) - abs(the_location.NS_degree)) * 60 lon_degree = float(fields[WPT_LON]) the_location.EW_degree = lon_degree #the_location.EW_minute = (abs(lon_degree) - abs(the_location.EW_degree)) * 60 p = re.compile('(\D+)(\d+)') dgs = p.findall(fields[WPT_CODE]) if dgs: code_data = dgs[0] the_geothing.code = fields[WPT_CODE] the_geothing.pid = int(code_data[1]) the_geothing.type_code = code_data[0] p = re.compile(u'(.+)от(.+)') dgs = p.findall(fields[WPT_TITLE]) if dgs: title = dgs[0] the_geothing.name = title[0] the_geothing.author = title[1] d = float(fields[WPT_DATE]) the_geothing.created_date = Dephi_date_to_python_date(d) if the_geothing.type_code in GEOCACHING_ONMAP_TYPES: geothing = get_object_or_none(Geothing, 
pid=the_geothing.pid, geosite=geosite) if geothing is not None: update_geothing(geothing, the_geothing, the_location) else: create_new_geothing(the_geothing, the_location, geosite) log('map_gcsu_caches', 'OK') elapsed = time() - start print "Elapsed time -->", elapsed
def main(): LOAD_CACHES = True start = time() yplib.setUp() yplib.set_debugging(False) # log in r = yplib.post2('http://opencaching.pl/login.php', (('LogMeIn', 'zaloguj'), ('email', 'kurianin'), ('password', 'gjhjkjy'), ('action', 'login'), ('target', 'index.php'))) soup = yplib.soup() a = soup.find('a', text='kurianin') if not a: print 'Authorization failed' return False print 'OK' ## search page #r = yplib.get('http://opencaching.pl/search.php') #soup = yplib.soup() # get wpt file r = yplib.get( 'http://opencaching.pl/search.php?searchto=searchbyname&showresult=1&expert=0&output=HTML&sort=bycreated&f_inactive=1&f_ignored=1&f_userfound=1&f_userowner=1&f_watched=0&f_geokret=0&country=PL®ion=&cachetype=1111111110&cache_attribs=&cache_attribs_not=&cachesize_1=1&cachesize_2=1&cachesize_3=1&cachesize_4=1&cachesize_5=1&cachesize_6=1&cachesize_7=1&cachevote_1=-3&cachevote_2=3.000&cachenovote=1&cachedifficulty_1=1&cachedifficulty_2=5&cacheterrain_1=1&cacheterrain_2=5&cacherating=0&cachename=%25&cachename=' ) soup = yplib.soup(cp='utf8') link_to_wpt = '' #the_div = soup.find('div', {'class':"content2-pagetitle"}) wpt_link = re.compile('ocpl\d+\.wpt\?.+count\=max.*') a_list = soup.findAll('a', {'class': "links", 'title': "Oziexplorer .wpt"}) if a_list: for a in a_list: if a.get('href') and wpt_link.match(a.get('href')): link_to_wpt = a.get('href') break print link_to_wpt if link_to_wpt: r = yplib.get(link_to_wpt) soup = yplib.soup(cp='utf8') wpt = soup.text.split('\n') else: print 'oblom' return WPT_CODE = 10 WPT_LAT = 2 WPT_LON = 3 WPT_TITLE = 1 WPT_DATE = 4 MY_CONSUMER_KEY = 'fky3LF9xvWz9y7Gs3tZ6' FIELDS = 'code|name|location|type|status|url|owner|date_created' geocach_api_request = 'http://opencaching.pl/okapi/services/caches/geocache?cache_code=%s&consumer_key=%s&fields=%s' geosite = Geosite.objects.get(code='OCPL') print geosite print len(wpt), 'points' k = 0 uc = 0 nc = 0 for point in wpt: k += 1 fields = point.split(',') if fields[0] == '-1': the_geothing = 
TheGeothing() the_geothing.pid = 1 the_location = TheLocation() lat_degree = float(fields[WPT_LAT]) the_location.NS_degree = lat_degree #the_location.NS_minute = (abs(lat_degree) - abs(the_location.NS_degree)) * 60 lon_degree = float(fields[WPT_LON]) the_location.EW_degree = lon_degree #the_location.EW_minute = (abs(lon_degree) - abs(the_location.EW_degree)) * 60 code_str = fields[WPT_CODE] parts = code_str.split('/') if len(parts) == 4: cache_code = parts[0] the_geothing.code = cache_code the_geothing.name = fields[WPT_TITLE] geothing_items = Geothing.objects.filter( code=the_geothing.code, geosite=geosite) if geothing_items.count() > 0: geothing = geothing_items[0] if the_geothing.name == geothing.name and not location_was_changed( geothing.location, the_location): continue url = geocach_api_request % (cache_code, MY_CONSUMER_KEY, FIELDS) try: response = urllib2.urlopen(url) json_str = response.read() cache_data = json.loads(json_str) if cache_data.get('status') != 'Available': continue #print cache_data.get('type') the_geothing.type_code = OCPL_TYPES.get( cache_data.get('type')) #print the_geothing.type_code cache_url = cache_data.get('url') if not cache_url: continue p = re.compile(u'OP([\dA-F]+)$') dgs = p.findall(cache_url) the_geothing.pid = int(dgs[0], 16) owner_name = '' if cache_data.get('owner'): owner_name = cache_data.get('owner').get('username') the_geothing.author = owner_name date_created = cache_data.get('date_created') if date_created: date_created = date_created[:10] parts = date_created.split('-') if parts and len(parts) == 3: dt = datetime(int(parts[0]), int(parts[1]), int(parts[2])) the_geothing.created_date = dt except: print print 'exception.' 
print url print cache_data #break continue if the_geothing.type_code in GEOCACHING_ONMAP_TYPES: geothing = get_object_or_none(Geothing, pid=the_geothing.pid, geosite=geosite) if geothing is not None: update_geothing(geothing, the_geothing, the_location) uc += 1 else: create_new_geothing(the_geothing, the_location, geosite) nc += 1 #break sql = """ select COUNT(*) FROM ( select g.code as code, count(id) as cnt from geothing g group by g.code having cnt > 1 ) as tbl """ dc = sql2val(sql) message = 'OK. updated %s, new %s, doubles %s' % (uc, nc, dc) log('map_ocpl_caches', message) elapsed = time() - start print "Elapsed time -->", elapsed
def main(): #if not switch_off_status_updated(): #return False LOAD_CREATED_CACHE_LOGS = True LOAD_SEEK_CACHE_LOGS = True LOAD_RECOMMEND_CACHE_LOGS = True LOAD_PHOTOALBUM_LOGS = True start = time() yplib.setUp() yplib.set_debugging(False) r = yplib.post2('http://www.geocaching.su/?pn=108', (('Log_In', 'Log_In'), ('email', '*****@*****.**'), ('passwd', 'zaebalixakeryvas'), ('longterm', '1'))) soup = yplib.soup() a = soup.find('a', attrs={'class': "profilelink"}, text='galdor') if not a: print 'Authorization failed' return False print print 'BEGIN' fh = open('cant_open_user_profile.txt', 'w') if LOAD_CREATED_CACHE_LOGS: LogCreateCach.objects.all().update(updated=False) print 'updating of creating logs' cachers = Geocacher.objects.all().values_list('pid', 'uid') t = re.compile('\?pn\=101\&cid=(\d+)') t1 = re.compile(u'создан\s+(\d\d\.\d\d\.\d\d\d\d)') for cacher in cachers: if cacher[1]: url = 'http://www.geocaching.su/site/popup/userstat.php?s=1&uid=%s' % cacher[ 1] try: yplib.get(url) except BrowserStateError: log_error(fh, cacher[1], 'bse') continue soup = yplib.soup() tbl = soup.find('table', attrs={'class': 'pages'}) if tbl: rows = tbl.findAll('tr') for row in rows: cach_pid = created_date = None coauthor = False cell = row.find('td') if cell: a_list = cell.findAll('a') for a in a_list: cach_pid = None parts = t.findall(a['href']) if len(parts): cach_pid = int(parts[0]) txt = cell.text if u'(соавтор)' in txt: coauthor = True found = t1.findall(txt) if found: created_date = found[0] created_date = date_or_none(created_date) if cach_pid: print cacher[0], cach_pid, txt.encode('utf8') the_log, created = LogCreateCach.objects.\ get_or_create( author_pid=cacher[0], cach_pid=cach_pid) the_log.created_date = created_date the_log.coauthor = coauthor the_log.updated = True the_log.save() else: log_error(fh, cacher[1], 'npc') LogCreateCach.objects.filter(updated=False).delete() if LOAD_SEEK_CACHE_LOGS: LogSeekCach.objects.all().update(updated=False) cachers = 
Geocacher.objects.all().values_list( 'pid', 'uid') #.filter(pid=18849) t = re.compile('\?pn\=101\&cid=(\d+)') t1 = re.compile(u'создан\s+(\d\d\.\d\d\.\d\d\d\d)') t2 = re.compile(u'найден\s+(\d\d\.\d\d\.\d\d\d\d)') t3 = re.compile(u'оценен\s+на\s+(\d)') for cacher in cachers: if cacher[1]: url = 'http://www.geocaching.su/site/popup/userstat.php?s=2&uid=%s' % cacher[ 1] try: yplib.get(url) soup = yplib.soup() except BrowserStateError: log_error(fh, cacher[1], 'bse') continue tbl = soup.find('table', attrs={'class': 'pages'}) if tbl: rows = tbl.findAll('tr') for row in rows: cach_pid = found_date = grade = None cell = row.find('td') if cell: a_list = cell.findAll('a') for a in a_list: cach_pid = None parts = t.findall(a['href']) if len(parts): cach_pid = int(parts[0]) txt = cell.text found = t3.findall(txt) if found: g = found[0] grade = int_or_none(g) found = t2.findall(txt) if found: found_date = found[0] found_date = date_or_none(found_date) if cach_pid: print cacher[0], cach_pid, txt.encode('utf8') the_log, created = LogSeekCach.objects.\ get_or_create( cacher_pid=cacher[0], cach_pid=cach_pid, ) the_log.found_date = found_date the_log.grade = grade the_log.updated = True the_log.save() else: log_error(fh, cacher[1], 'npf') LogSeekCach.objects.filter(updated=False).delete() if LOAD_RECOMMEND_CACHE_LOGS: LogRecommendCach.objects.all().update(updated=False) cachers = Geocacher.objects.all().values_list('pid', 'uid') t = re.compile('\?pn\=101\&cid=(\d+)') for cacher in cachers: if cacher[1]: url = 'http://www.geocaching.su/site/popup/userstat.php?s=3&uid=%s' % cacher[ 1] try: yplib.get(url) soup = yplib.soup() except BrowserStateError: log_error(fh, cacher[1], 'bse') continue tbl = soup.find('table', attrs={'class': 'pages'}) if tbl: rows = tbl.findAll('tr') for row in rows: cach_pid = found_date = grade = None cell = row.find('td') if cell: a_list = cell.findAll('a') for a in a_list: cach_pid = None parts = t.findall(a['href']) if len(parts): cach_pid = int(parts[0]) 
txt = cell.text if cach_pid: print cacher[0], cach_pid, txt.encode('utf8') the_log, created = LogRecommendCach.\ objects.get_or_create( cacher_pid=cacher[0], cach_pid=cach_pid) the_log.updated = True the_log.save() else: log_error(fh, cacher[1], 'npr') LogRecommendCach.objects.filter(updated=False).delete() if LOAD_PHOTOALBUM_LOGS: LogPhotoAlbum.objects.all().update(updated=False) cachers = Geocacher.objects.all().values_list('pid', 'uid') t = re.compile('showmemphotos\.php\?cid=(\d+)') for cacher in cachers: if cacher[1]: url = 'http://www.geocaching.su/site/popup/userstat.php?s=4&uid=%s' % cacher[ 1] try: yplib.get(url) soup = yplib.soup() except BrowserStateError: log_error(fh, cacher[1], 'bse') continue tbl = soup.find('table', attrs={'class': 'pages'}) if tbl: rows = tbl.findAll('tr') for row in rows: cach_pid = found_date = grade = None cell = row.find('td') if cell: a_list = cell.findAll('a') for a in a_list: cach_pid = None parts = t.findall(a['href']) if len(parts): cach_pid = int(parts[0]) txt = cell.text if cach_pid: print cacher[0], cach_pid, txt.encode('utf8') the_log, created = LogPhotoAlbum.\ objects.get_or_create( cacher_pid=cacher[0], cach_pid=cach_pid) the_log.updated = True the_log.save() else: log_error(fh, cacher[1], 'npp') LogPhotoAlbum.objects.filter(updated=False).delete() elapsed = time() - start print "Elapsed time -->", elapsed #switch_on_status_updated() log('gcsu_logs', 'OK') fh.close()