def 開窗來跑(我):
    '''
    For now, only the translated Chinese programs may be run, and the
    user may modify them freely. The original English programs must not
    be changed, because they are part of the system.

    This feature basically works, but closing the window still leaves
    some "dirt" behind; that remains to be fixed.
    '''
    import os   # needed for the os.path checks below
    import sys

    # Earlier attempt, kept for reference:
    # import idlelib
    # from idlelib import PyShell
    # PyShell = importlib.reload(idlelib.PyShell)

    #import idlelib.ryPyShell as PyShell
    import idlelib.PyShell as PyShell

    fn = 我.module.__file__
    #print('fn= ', fn)

    tcDir = 我.tcDir  #= 'tcDir'

    # Refuse to run anything outside the translated-programs directory
    if os.path.abspath(os.path.dirname(fn)) != os.path.abspath(tcDir):
        msg = '%s not in %s\n' % (fn, os.path.abspath(tcDir))
        msg += 'For now, only translated Chinese programs may be run (and freely modified).\n'
        print(msg)
        return

    sys.argv = ['', '-r', fn]

    #我.root.wm_protocol("WM_DELETE_WINDOW", 我._destroy)
    PyShell.main()  #### This line is OK on Windows, but not on macOS!
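For reference, PyShell.main() parses sys.argv just as the idle command-line launcher does, so prepending '-r' asks the shell to run the given file. A minimal sketch of that trick in isolation, assuming a pre-3.6 idlelib (where the module is still named PyShell) and a hypothetical hello.py:

# Minimal sketch: hand IDLE's own argument parser a file to run.
# 'hello.py' is a hypothetical example file, not from the code above.
import sys
import idlelib.PyShell as PyShell

sys.argv = ['', '-r', 'hello.py']  # '-r file': run the file in the IDLE shell
PyShell.main()                     # blocks until the IDLE window is closed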
#!/usr/bin/python
try:
    import idlelib.PyShell
except ImportError:
    # IDLE is not installed, but maybe PyShell is on sys.path:
    try:
        import PyShell
    except ImportError:
        raise
    else:
        import os
        idledir = os.path.dirname(os.path.abspath(PyShell.__file__))
        if idledir != os.getcwd():
            # We're not in the IDLE directory; help the subprocess find run.py
            pypath = os.environ.get('PYTHONPATH', '')
            if pypath:
                os.environ['PYTHONPATH'] = pypath + ':' + idledir
            else:
                os.environ['PYTHONPATH'] = idledir
        PyShell.main()
else:
    idlelib.PyShell.main()
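Both branches of this launcher predate Python 3.6, where IDLE's internal modules were renamed to lowercase. On a modern interpreter the same launch reduces to the sketch below; on recent Pythons, `python -m idlelib` from a shell does the same thing.

# Python 3.6+ equivalent of the launcher above.
import idlelib.pyshell

idlelib.pyshell.main()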
# Python 2 code. Assumes module-level imports of re, time, csv and
# selenium's By / expected_conditions (EC), plus globals url_q, start_url,
# users_crawled, define_driver, driver_get, and pyshel (IDLE's PyShell).
def crawl():
    global lis, bis, data1, data2, url, ratings, reviews, driver
    for uri in start_url:
        url_q.enqueue(uri.strip())
    driver, wait = define_driver()
    print "\n*****************Starting the CRAWL*********************************\n"
    while not url_q.isEmpty():
        url = url_q.dequeue()

        ## Going to the Reviews part of the page
        driver = driver_get(driver, url + '#reviews')
        print "\n************Waiting for the reviews page to load***********\n"
        while True:
            try:
                wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#reviews > div.s-title.mbot0.borderless')))
                break
            except:
                print "......Network Error....."
        print "\n************Reviews Page LOADED********************\n"

        ## Getting the user_id
        print "\n***************Fetching the user_ID********************\n"
        try:
            user_id = driver.find_element_by_xpath(
                '//div[contains(@class,"follow")]').get_attribute('data-user-id')
        except:
            user_id = -1
        try:
            no_of_reviews = driver.find_element_by_xpath('//a[@data-tab="reviews"]').text
            no_of_reviews = re.findall(r'\d+', no_of_reviews)
            no_of_reviews = int(no_of_reviews[0])
        except:
            no_of_reviews = 0
        if user_id in users_crawled or user_id == -1:
            print "\n__________User already CRAWLED________________\n"
            continue
        try:
            if driver.find_element_by_xpath(
                    '//div[contains(@class,"usr-location")]').text.strip() != 'Kolkata':
                continue
        except:
            pass
        if no_of_reviews != 0:
            print "\n__________________New USER... Starting the crawl__________________\n"

            # Getting and clicking the LOAD MORE button until it disappears
            print "\n**********Clicking the LOAD_MORE button***********\n"
            try:
                load_more = driver.find_element_by_class_name('load-more')
                while True:
                    try:
                        s = wait.until(EC.element_to_be_clickable(
                            (By.CLASS_NAME, 'load-more')))
                        load_more.click()
                        time.sleep(2)
                    except Exception, e:
                        print "E1: ", str(e)
                        break
            except Exception, e:
                print "E2 :", str(e)
            print "\n************ALL data LOADED****************\n"

            ## Getting the review DIV blocks
            print "\n********Wait while we fetch Reviews and other data**********\n"
            try:
                elem = driver.find_elements_by_xpath('//*[@id="reviewFeed"]/div')
            except Exception, e:
                print str(e)

            ## Getting the total review blocks
            g = elem[0].find_elements_by_xpath("//div[contains(@class,'rev-text')]")

            ## Getting the reviews and ratings
            ratings = []
            reviews = []
            for block in g:
                rating = block.find_element_by_tag_name('div').get_attribute('aria-label')
                review = block.text
                if rating is not None:
                    rating = rating.strip()
                if review not in reviews and review != '' and review != ' ':
                    reviews.append(review)
                    ratings.append(rating)

            ## Getting ReviewId, RestaurantId, RestaurantName,
            ## RestaurantAddress and datetime
            lis = []
            bis = []
            for block in elem:
                rev_id = block.get_attribute('data-review_id')
                res_id = block.find_element_by_class_name(
                    'snippet__name').find_element_by_class_name(
                        'snippet__link').get_attribute('data-entity_id')
                res_name = block.find_element_by_class_name('snippet__name').text
                res_addr = block.find_element_by_class_name('snippet__location').text
                datetime = block.find_element_by_tag_name('time').get_attribute('datetime')
                # lis holds lists, so the membership test must use a list too
                # (the original checked for a tuple, which never matched).
                if [rev_id, res_id] not in lis:
                    lis.append([rev_id, res_id])
                    bis.append([res_name, res_addr, datetime])

            # De-duplicate while keeping the two lists aligned
            data1 = []
            data2 = []
            for i in xrange(len(lis)):
                if lis[i] not in data1:
                    data1.append(lis[i])
                    data2.append(bis[i])

            ## Getting other necessary details
            # no_of_reviews = driver.find_element_by_xpath('//a[@data-tab="reviews"]').text
            # no_of_reviews = re.findall(r'\d+', no_of_reviews)
            # no_of_reviews = int(no_of_reviews[0])
            # user_id = driver.find_element_by_xpath('//div[contains(@class,"follow")]').get_attribute('data-user-id')
            user_link = url
            user_name = driver.find_element_by_class_name('full-name').text
            print no_of_reviews, len(data1), len(ratings), len(reviews)
            print "\n********ALL data for %s fetched**************\n" % user_name

            ## Pause for user intervention (an IDLE shell) if the review
            ## count does not equal the list lengths
            if no_of_reviews != len(data1) or no_of_reviews != len(ratings) \
                    or no_of_reviews != len(reviews):
                pyshel.main()

            print "\n**********Writing %s's data to the file************\n" % user_name
            with open(r'..\dust_bin\user_data.csv', 'ab') as c:
                f = csv.writer(c)
                f.writerow([user_id, user_name, user_link, no_of_reviews])
            with open(r'..\dust_bin\review_data.csv', 'ab') as c:
                f = csv.writer(c)
                for i in xrange(len(data1)):
                    f.writerow([user_id] + data1[i]
                               + map(lambda x: x.encode('utf-8'), data2[i])
                               + [reviews[i].encode('utf-8')] + [ratings[i]])
            print "\n**********Data Written to file************\n"

            ## Adding the crawled user
            users_crawled.append(user_id)
            print "\n************ User %s crawled **************\n" % user_name
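Note the unusual debugging device in this crawler: when the advertised review count disagrees with the scraped list lengths, it calls pyshel.main() to open an IDLE shell so a human can inspect the module state before execution continues. If IDLE is not available, the standard library's code.interact gives a lighter-weight pause with the same effect; this is a substitute technique, not what the crawler itself uses, and the counts below are dummy values:

import code

expected, got = 10, 8  # hypothetical mismatching counts
if expected != got:
    # Opens a REPL over the current locals; the script resumes
    # when the user exits the prompt (Ctrl-D, or Ctrl-Z on Windows).
    code.interact(banner="count mismatch -- inspect state", local=locals())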
import os
import sys

# IDLE's modules were renamed in Python 3.6 (PyShell -> pyshell,
# configHandler -> config); the major-version check here assumes
# any Python 3 is new enough.
if sys.version_info[0] == 2:
    from idlelib import PyShell as pyshell
    from idlelib.configHandler import idleConf, IdleUserConfParser
else:
    from idlelib import pyshell
    from idlelib.config import idleConf, IdleUserConfParser

# Switch to our own user configs instead of the ones in the home directory
this_dir = os.path.dirname(__file__)
for cfg_type in idleConf.config_types:
    cfg_path = os.path.join(this_dir, "config-%s.cfg" % cfg_type)
    idleConf.userCfg[cfg_type] = IdleUserConfParser(cfg_path)
    idleConf.userCfg[cfg_type].Load()

# Back on track
pyshell.main()
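The loop above redirects all of IDLE's user config files (config-main.cfg, config-extensions.cfg, config-highlight.cfg, config-keys.cfg) to the script's own directory. As a quick sanity check, an option can be read back through idleConf; the section and option names below follow IDLE's config-main.def, and the exact GetOption keywords may differ slightly between IDLE versions, so treat this as a sketch:

# Read an option back through the (now redirected) user config.
width = idleConf.GetOption('main', 'EditorWindow', 'width',
                           type='int', default=80)
print("editor width: %s" % width)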