def edit_post(self, postid, body, reason=''):
    if not self.is_logged_in():
        raise ValueError("Failure: can't edit post before logging in")
    postid = str(postid)
    page = blogotubes('http://www.mersenneforum.org/editpost.php?do=editpost&p=' + postid)
    if username not in page:  # Verify cookies installed properly
        raise ValueError("Failure: tried to edit post {} but not logged in!".format(postid))
    stoken, phash, ptime = self.parse_tokens(page)
    data = self.fill_form(body, postid, stoken, phash, ptime, reason)
    page = blogotubes('http://www.mersenneforum.org/editpost.php?do=updatepost&p=' + postid, data=data)
    # Ignore response until I know what to check for
    return page
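
# A minimal usage sketch, not part of the original module: it assumes these
# methods live on a session-like object that also provides login() as defined
# below. The post id, body, and reason here are placeholder values.
def _example_edit_post(session):
    if session.login():
        session.edit_post(123456, "Updated reservations", reason="automated edit")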
def get_reservations(pid):  # Copied from allseq.py
    page = blogotubes(reservation_page + '?p=' + str(pid))
    # Isolate the [code] block with the reservations
    page = re.search(r'<pre.*?>(.*?)</pre>', page, flags=re.DOTALL).group(1)
    ind = page.find('\n')
    if ind == -1:  # No newline means only "<b>Seq Who Index Size</b>", i.e. empty, so no reservations
        return ""
    else:
        return page[ind+1:]  # Dump the first line == "<b>Seq Who Index Size</b>"
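
# Hedged sketch of consuming get_reservations(): going by the header line it
# dumps, each remaining line of the <pre> block should hold whitespace-separated
# "Seq Who Index Size" columns. The exact column handling here is an assumption.
def _example_list_reservations(pid):
    for line in get_reservations(pid).splitlines():
        fields = line.split()
        if len(fields) >= 2:
            print("sequence {} reserved by {}".format(fields[0], fields[1]))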
def login(self):
    # Standard vBulletin login form fields
    data = {'vb_login_username': username, 'vb_login_password': passwd}
    data['s'] = ''
    data['securitytoken'] = 'guest'
    data['do'] = 'login'
    data['vb_login_md5password'] = ''
    data['vb_login_md5password_utf'] = ''
    data['cookieuser'] = '******'
    page = blogotubes('http://www.mersenneforum.org/login.php?do=login', data=data)
    return username in page
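
# Quick usage sketch: the other methods treat "username appears in the response
# page" as the logged-in check, so a caller can simply gate on login().
# ForumSession is a hypothetical name for the class holding these methods.
def _example_login():
    session = ForumSession()  # hypothetical class with login()/edit_post()
    if not session.login():
        raise RuntimeError("login failed; check the username/passwd globals")
    return session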
def get_data():
    global data_file
    if "http" in data_file:
        print("Getting the current data")
        txt = blogotubes(data_file)
        if txt is None:
            raise ValueError("Couldn't get data file")
        else:
            data_file = "AllSeq.json"
            with open(data_file, "w") as f:
                f.write(txt)
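
# Follow-up sketch, assuming (from the filename alone) that the data file holds
# JSON; this only shows reading back what get_data() leaves behind, not the
# module's actual consumption of the file.
def _example_load_data():
    import json
    get_data()  # after this, the data_file global names a local JSON file
    with open(data_file) as f:
        return json.load(f)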
def get_id_info(id):
    base = "http://factordb.com/index.php?id="
    page = blogotubes(base + str(id))
    if not page:  # or 'FF' in page:
        raise ValueError("http error")
    smalls = smallfact.findall(page)
    larges = largefact.findall(page)
    comps = composite.findall(page)
    # print(compid, "\n{}\n##########################################\n\n{}".format(smalls, page))
    # Apply get_num() to the first entry of each tuple, then concatenate the result with the second entry
    larges = [num + exp for num, exp in
              zip(map(get_num, (l[0] for l in larges)), (l[1] for l in larges))]
    comps = {int(num): (int(exp[1:]) if exp else 1) for num, exp in
             zip(map(get_num, (c[0] for c in comps)), (c[1] for c in comps))}
    # comp = get_num(compid)
    return nt.Factors(" * ".join(smalls + larges)), comps
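
# Isolated sketch of the exponent-parsing step above, with made-up (number,
# exponent) pairs standing in for the composite regex's captures after
# get_num() has resolved the full digits: an exponent like "^2" is stripped
# of its caret, and a missing exponent defaults to 1.
def _example_parse_exponents():
    matches = [("1234567", "^2"), ("7654321", "")]
    return {int(num): (int(exp[1:]) if exp else 1) for num, exp in matches}
    # -> {1234567: 2, 7654321: 1}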
def spider(last_pid):
    wobsite = 'http://www.mersenneforum.org/showthread.php?t=11588&page='
    backup()
    db = read_db()
    spider_msg = []

    ###########################################################################
    # First the standalone func that processes mass text file reservations

    def parse_text_file(reservee, url):
        global email_msg
        old = {seq.seq for seq in db.values() if seq.res == reservee}
        txt = blogotubes(url)
        current = set()
        for line in txt.splitlines():
            if re.match(r'(?<![0-9])[0-9]{5,6}(?![0-9])', line):
                seq = int(line)
                if seq in current:
                    string = "Duplicate sequence? {} {}".format(seq, url)
                    Print(string)
                    email_msg += string + '\n'
                else:
                    current.add(seq)
            elif not re.match(r'^[0-9]+$', line):
                string = "Unknown line from {}: {}".format(url, line)
                Print(string)
                email_msg += string + '\n'
        # easy peasy lemon squeezy
        done = old - current
        new = current - old
        if done or new:
            spider_msg.append('{}: Add {}, Drop {}'.format(reservee, len(new), len(done)))
            drop_db(db, reservee, done)
            add_db(db, reservee, new)

    ###########################################################################
    # This processes the parsed HTML and its add/drop commands, and actually
    # affects the current reservations

    def process_msg(pid, name, msg):
        # Check drop keywords first: 'unreserv' contains 'reserv', so testing
        # add keywords first would misclassify unreservations
        addkws = ('Reserv', 'reserv', 'Add', 'add', 'Tak', 'tak')
        dropkws = ('Unreserv', 'unreserv', 'Drop', 'drop', 'Releas', 'releas')
        add, drop = [], []
        for line in msg.splitlines():
            if any(kw in line for kw in dropkws):
                for s in re.findall(r'(?<![0-9])[0-9]{5,6}(?![0-9])', line):  # matches only 5/6 digit numbers
                    drop.append(int(s))
            elif any(kw in line for kw in addkws):
                for s in re.findall(r'(?<![0-9])[0-9]{5,6}(?![0-9])', line):
                    add.append(int(s))
        la = len(add)
        ld = len(drop)
        if la or ld:
            Print('{}: {} adding {}, dropping {}'.format(pid, name, repr(add), repr(drop)))
            spider_msg.append('{}: Add {}, Drop {}'.format(name, la, ld))
            add_db(db, name, add)
            drop_db(db, name, drop)

    ###########################################################################
    # Begin the parsers, which convert the various HTML into Python data
    # structures for processing (and also reverse stack order).
    # For each page of the thread, the parsers return a list of
    # (post_id, author, html-replaced-post_body).
    # All of my previous html parsing needs have been simple enough that
    # regexes were sufficient, and a proper parser would have been overkill;
    # this, though, is much closer to the border, and if I already knew how to
    # use any parser, I would. But the overhead is too much to start now, so...
    # thankfully there are comments in the html that are individually closed;
    # without that, this would be substantially harder and I'd probably resort
    # to a parser.

    def parse_msg(msg):
        # Drop text after the last </div>
        ind = msg.rfind('</div>')
        msg = msg[:ind]
        if msg.count('<div') > 1:  # There are quotes in the message
            # Drop text before the second to last </div>
            ind = msg.rfind('</div>')
            msg = msg[ind+6:]
        else:
            # Drop everything up through the first tag
            ind = msg.find('>')
            msg = msg[ind+1:]
        return msg.replace('<br />', '').strip()

    def parse_post(post):
        # The avatar alt text spans two lines, hence the newline in the pattern
        name = re.search(r'alt="(.*?) \nis o', post).group(1)  # "is offline" or "is online"
        msg = re.search(r'<!-- message -->(.*?)<!-- / message -->', post, re.DOTALL).group(1)
        return name, parse_msg(msg)

    def parse_page(page):
        out = []
        posts = re.findall(r'<!-- post #([0-9]{6,7}) -->(.*?)<!-- / post #\1 -->', page, re.DOTALL)
        for post in posts:
            out.append((int(post[0]),) + parse_post(post[1]))
        return out

    ###########################################################################
    # End parsers; first one tiny helper function

    def order_posts(posts):
        if posts != sorted(posts, key=lambda post: post[0]):
            raise ValueError("Out of order posts! Pids:\n{}".format([post[0] for post in posts]))
        return posts[0][0]

    ###########################################################################
    # Now begin the actual logic of the top-level spider()

    html = blogotubes(wobsite + '10000')  # vBulletin rounds to last page
    all_pages = [parse_page(html)]
    lowest_pid = order_posts(all_pages[0])
    if not last_pid:  # If this is the first time running the script
        last_pid = lowest_pid  # On first time run, ignore all but the last page
    while lowest_pid > last_pid:  # It's probable that we missed some posts on the previous page
        page_num = re.search('<td class="vbmenu_control" style="font-weight:normal">Page ([0-9]+)', html).group(1)
        page_num = str(int(page_num) - 1)
        Print("Looks like posts were missed, checking page", page_num)
        html = blogotubes(wobsite + page_num)
        all_pages.insert(0, parse_page(html))
        lowest_pid = order_posts(all_pages[0])
    all_posts = [post for page in all_pages for post in page if post[0] > last_pid]
    if all_posts:
        order_posts(all_posts)  # Assert order, ignore lowest-pid retval
        for post in all_posts:
            process_msg(*post)
        last_pid = all_posts[-1][0]  # Highest PID processed
    else:
        Print("No new posts!")
    for reservee, url in txtfiles.items():
        parse_text_file(reservee, url)
    if spider_msg:
        write_db(db)
        update()
        if not use_local_reservations:
            send('Spider: ' + ' | '.join(spider_msg))  # For now, doesn't check if send was successful
    return last_pid
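
# Hedged driver sketch, not in the original script: spider() takes the highest
# post id previously processed and returns the new highest, so a caller can
# persist that watermark between runs. The file name here is made up.
def _example_spider_run(pid_file="last_pid.txt"):
    try:
        with open(pid_file) as f:
            last_pid = int(f.read())
    except (FileNotFoundError, ValueError):
        last_pid = 0
    last_pid = spider(last_pid)
    with open(pid_file, "w") as f:
        f.write(str(last_pid))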
###############################################################################

import re
from time import time
from _import_hack import add_path_relative_to_script
add_path_relative_to_script('..')
# this should be removed when proper pip installation is supported
from mfaliquot.sequence import Sequence
from mfaliquot.myutils import linecount, Print, strftime, blogotubes, add_cookies, email

# Some slight modifications of the default global variables
if 'http' in info:
    txt = blogotubes(info)
    if txt is None:
        Print("Couldn't get info, no info will be updated")
        info = None
    else:
        info = dir + '/AllSeq.txt'
        with open(info, 'w') as f:
            f.write(txt)

def get_reservations(pid):  # Copied from allseq.py
    page = blogotubes(reservation_page + '?p=' + str(pid))
    # Isolate the [code] block with the reservations
    page = re.search(r'<pre.*?>(.*?)</pre>', page, flags=re.DOTALL).group(1)
    ind = page.find('\n')
    if ind == -1:  # No newline means only "<b>Seq Who Index Size</b>", i.e. empty, so no reservations
        return ""
    else:
        return page[ind+1:]  # Dump the first line == "<b>Seq Who Index Size</b>"
def get_num(id):
    page = blogotubes("http://factordb.com/index.php?showid=" + id)
    num = largedigits.search(page).group(1)
    num = re.sub(r"[^0-9]", "", num)
    return num
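
# Sketch of the digit-stripping step in isolation; the literal is a made-up
# stand-in for factordb's marked-up rendering of a large number (largedigits
# itself is a module-level regex defined elsewhere).
def _example_strip_markup():
    raw = "123<456...789>0"
    return re.sub(r"[^0-9]", "", raw)  # -> "1234567890"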