def cleanUp():
    '''Delete any existing session and clean up the database.'''
    global conn
    print("\nAre you SURE you want to perform a clean up?")
    print("This will STOP and DELETE any running session.")
    s = input("Type 'YES' to confirm: ")
    if s.strip() == "YES":
        # Connect to the DB if not already connected.
        connect()
        print(utils.clean_message())
        sql.clean(conn)
    else:
        print("\nNo changes made. Exiting.")
    end_session(2)
# Shared imports for the functions in this listing.
import datetime
import re
from urllib.parse import urljoin


def get_links(soup, url):
    '''Return the set of cleaned, absolute URLs linked from a parsed page.'''
    if not soup:
        return set()
    # Resolve each <a href> against the page URL; anchors without an
    # href resolve to the page URL itself.
    return set(
        clean(urljoin(url, a.attrs.get('href')))
        for a in soup.find_all('a')
    )
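# Usage sketch for get_links(), not part of the crawler itself. It
# assumes bs4 (BeautifulSoup) is installed and that clean() is this
# project's URL-normalising helper; the URL below is purely
# illustrative.
def _demo_get_links():
    import urllib.request
    from bs4 import BeautifulSoup

    url = 'https://example.com/'
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    return get_links(soup, url)  # set of absolute URLs found on the page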
def stem(p):
    '''Strip surrounding brackets/punctuation from a token, then stem it.'''
    m = re.match(r"^\s*[<(]*(.*?)[.,?!:)>/]*\s*$", p)
    if m:
        p = m.groups()[0]
    # Stem the whole token, uppercase it, and cap it at 25 characters.
    p = porter.stem(p, 0, len(p) - 1).upper()[:25]
    p = clean(p)
    return p
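# Usage sketch for stem(). It assumes `porter` is a PorterStemmer
# instance in the style of Martin Porter's reference porter.py, whose
# stem(p, i, j) stems p[i:j+1], and that clean() passes plain ASCII
# tokens through unchanged; both assumptions come from how the code
# above calls these helpers, not from project documentation.
def _demo_stem():
    # "(running," -> brackets and punctuation stripped -> "running"
    # -> Porter-stemmed to "run" -> uppercased -> "RUN"
    return stem("(running,")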
def crawl(root, regex=r'^.*$', level=1, quiet=False):
    '''Crawl outward from a root URL for the given number of levels.'''
    cursor = db.cursor()
    RootPageID = select_or_insert(db, 'Webpage', url=root, quiet=quiet)[0]['WID']
    CrawlID = insert1(db, 'Crawl', rootwid=RootPageID, nLevels=level)[0]['CID']
    # Mine the root page and keep only the links matching the crawl regex.
    soup = mine(root, cid=CrawlID, wid=RootPageID, quiet=quiet, regex=regex)
    discovered = get_links(soup, root)
    discovered = [link for link in discovered if re.match(regex, link)]
    discovered_wids = set()
    for url in discovered:
        curl = clean(url)
        discovered_wids.add(
            select_or_insert(db, 'Webpage', url=str(curl))[0]['WID'])
    discovered_wids = [str(wid) for wid in sorted(discovered_wids)]
    if discovered_wids:
        # Tag the newly discovered pages with this crawl's ID.
        query = 'UPDATE Webpage SET newCID={cid} WHERE wid IN ({wids});'.format(
            cid=CrawlID, wids=','.join(discovered_wids))
        if not quiet:
            print(query)
        cursor.execute(query)
    else:
        print('-- No links')
    insert(db, 'Link', [{'fromWID': RootPageID, 'toWID': wid}
                        for wid in discovered_wids])
    while level > 0:
        # Mine every page tagged for this crawl that is not yet mined.
        query = ('SELECT wid, url FROM Webpage '
                 'WHERE newCID = {cid} AND mined=False;').format(cid=CrawlID)
        if not quiet:
            print(query)
        cursor = db.cursor()
        cursor.execute(query)
        for wid, url in cursor:
            mine(url, cid=CrawlID, wid=wid, quiet=quiet, regex=regex)
        level -= 1
        print('-- Level:', level)
    # Record when the crawl finished.
    query = 'UPDATE Crawl SET endtime="{now}" WHERE cid = {cid};'.format(
        cid=CrawlID, now=datetime.datetime.now())
    if not quiet:
        print(query)
    cursor.execute(query)
    db.close()
def crawl(root, regex=r'^.*$', level=1, quiet=False):
    '''Crawl outward from a root URL, using the table-helper classes.'''
    RootPageID = Webpage.select_or_insert(url=root)['WID']
    CrawlID = Crawl.insert1(rootwid=RootPageID, nLevels=level,
                            access=True)['CID']
    # Mine the root page and keep only the links matching the crawl regex.
    soup = mine(root, cid=CrawlID, wid=RootPageID, quiet=quiet, regex=regex)
    discovered = get_links(soup, root)
    discovered = [link for link in discovered if re.match(regex, link)]
    discovered_wids = set()
    for url in discovered:
        curl = clean(url)
        discovered_wids.add(Webpage.select_or_insert(url=str(curl))['WID'])
    discovered_wids = [str(wid) for wid in sorted(discovered_wids)]
    if discovered_wids:
        UpdateWebpage = db.cursor()
        query = 'UPDATE Webpage SET newCID={cid} WHERE wid IN ({wids});'.format(
            cid=CrawlID, wids=','.join(discovered_wids))
        if not quiet:
            print(query)
        UpdateWebpage.execute(query)
        UpdateWebpage.close()
        consolidate_all_webpages()
    else:
        print('-- No links')
    Link.insertlod([{'fromWID': RootPageID, 'toWID': wid}
                    for wid in discovered_wids])
    while level > 0:
        # breakpoint(CrawlID) is presumably a project-level checkpoint
        # helper taking a crawl ID, not Python's built-in breakpoint().
        breakpoint(CrawlID)
        SelectURL = db.cursor()
        query = ('SELECT wid, url FROM Webpage '
                 'WHERE newCID = {cid} AND mined=False;').format(cid=CrawlID)
        if not quiet:
            print(query)
        SelectURL.execute(query)
        for wid, url in SelectURL:
            mine(url, cid=CrawlID, wid=wid, quiet=quiet, regex=regex)
        breakpoint(CrawlID)
        level -= 1
        print('-- Level:', level)
        SelectURL.close()
    # Record when the crawl finished.
    FinalUpdate = db.cursor()
    query = 'UPDATE Crawl SET endtime="{now}" WHERE cid = {cid};'.format(
        cid=CrawlID, now=datetime.datetime.now())
    if not quiet:
        print(query)
    FinalUpdate.execute(query)
    FinalUpdate.close()
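# Usage sketch for crawl(): a one-level crawl that only follows links
# staying on the same (hypothetical) host. The URL and host regex are
# illustrative; any site-specific values belong to the caller.
def _demo_crawl():
    crawl('https://example.com/',
          regex=r'^https://example\.com/.*$',
          level=1,
          quiet=True)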
def stem(p):
    '''Extract the first word-like run from a token, then stem it.'''
    m = re.match(r"^.*?([A-Za-z0-9']+).*$", p)
    if m:
        p = m.groups()[0]
    p = porter.stem(p, 0, len(p) - 1).upper()
    p = clean(p)
    # Reject tokens that still carry escape sequences or non-ASCII
    # characters after cleaning.
    if '\\' in p or max([ord(c) for c in p] + [0]) > 127:
        return ''
    p = p.replace('"', '')
    return p[:25]