def scrape_photo_links(self, number, is_hashtag=False):
    EnvPrint.log_info("Scraping photo links...")

    encased_photo_links = re.finditer(r'src="([https]+:...[\/\w \.-]*..[\/\w \.-]*'
                                      r'..[\/\w \.-]*..[\/\w \.-].jpg)',
                                      self._driver.page_source)

    photo_links = [m.group(1) for m in encased_photo_links]

    EnvPrint.log_info(photo_links, "pprint")
def logoutAndLogin(self):
    self._driver.get(urljoin(self.setting['FACEBOOK_DOMAIN'], "accounts/logout"))
    self._driver.get(urljoin(self.setting['FACEBOOK_DOMAIN'], "accounts/login/"))

    EnvPrint.log_info("Since Instagram provides 5000 post views per hour, "
                      "re-login with another username and password from the authentication file.")

    # Input username
    try:
        username_input = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located((By.NAME, 'email'))
        )
        username_input.send_keys(self.auth_dict["FACEBOOK"][self.accountIdx]['username'])
    except Exception:
        self._driver.save_screenshot('img/{}'.format('screenshot_relogin_01.png'))

    # Input password
    try:
        password_input = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located((By.NAME, 'pass'))
        )
        password_input.send_keys(self.auth_dict["FACEBOOK"][self.accountIdx]['password'])

        # Submit
        password_input.submit()
    except Exception:
        self._driver.save_screenshot('img/{}'.format('screenshot_relogin_02.png'))

    WebDriverWait(self._driver, 60).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE))
    )
async def savePost(new_list):
    ignore_num = 0

    for i in range(0, len(new_list)):
        ahref_arr = new_list[i].find_elements_by_xpath(
            ".//a")[0].get_attribute("href").split('/')
        id = ahref_arr[len(ahref_arr) - 2]
        img_src = new_list[i].find_elements_by_xpath(
            ".//img[@class='_2di5p']")[0].get_attribute("src")
        text = new_list[i].find_elements_by_xpath(
            ".//img[@class='_2di5p']")[0].get_attribute("alt")
        reg_date = datetime.datetime.now()
        write_date = None

        try:
            response = requests.head(img_src, timeout=1)
            write_date = response.headers["last-modified"]
        except requests.exceptions.Timeout:
            write_date = ""
        finally:
            if text:
                exist_ids = None

                with open(self.csv_file_loc) as f:
                    csvreader = csv.reader(f)
                    exist_ids = [row[0] for row in csvreader]

                if id in exist_ids:
                    ignore_num = ignore_num + 1
                else:
                    with open(self.csv_file_loc, 'a') as file:
                        # CSV row: id, img_src, text, query, write_date, reg_date
                        csvwriter = csv.writer(file)
                        csvwriter.writerow([
                            id, img_src, text, self.query, write_date, reg_date
                        ])

                    text_enc = text.encode('utf-8')

                    EnvPrint.log_info(
                        {
                            "id": id,
                            "img": img_src,
                            "text": text_enc,
                            "has_tag": self.query,
                            "write_date": write_date,
                            "reg_date": reg_date
                        }, "debug")

    last_post_num_new = len(new_list) - ignore_num
    return last_post_num_new
def main():
    #  Arguments  #
    parser = argparse.ArgumentParser(description='Pengtai Instagram Crawler')
    parser.add_argument('-d', '--dir_prefix', type=str, default='./data/',
                        help='directory to save results')
    parser.add_argument('-q', '--query', type=str,
                        help="target to crawl, add '#' for hashtags")
    parser.add_argument('-t', '--crawl_type', type=str, default='all',
                        help="Options: 'all' | 'tags' | 'photos' | 'following'")
    parser.add_argument('-n', '--number', type=int, default=0,
                        help='Number of posts to download: integer')
    parser.add_argument('-l', '--headless', action='store_true',
                        help='If set, will use PhantomJS driver to run script as headless')
    parser.add_argument('-a', '--authentication', type=str, default='auth.json',
                        help='path to authentication json file')
    parser.add_argument('-s', '--setting', type=str, default='settings.json',
                        help='path to setting json file')
    parser.add_argument('-e', '--env', type=str, default='pro',
                        help="environment options: 'pro' | 'dev' | 'test'")
    parser.add_argument('-r', '--random', action='store_true',
                        help='enables tags mode with random hashtags @ setting.json')
    args = parser.parse_args()
    #  End Argparse  #

    nowDate = now.strftime("%Y%m%d")
    filename = './logs/log-' + args.env + '.' + nowDate + '.log'
    FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    if args.env == "pro":
        logging.basicConfig(filename=filename, level=logging.INFO, format=FORMAT)
    elif args.env == "dev":
        logging.basicConfig(filename=filename, level=logging.DEBUG)
        root = logging.getLogger()
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(FORMAT)
        ch.setFormatter(formatter)
        root.addHandler(ch)

    EnvPrint.env = args.env
    EnvPrint.log_info("=========================================")
    EnvPrint.log_info(args)

    crawler = FacebookCrawler(headless=args.headless, setting_path=args.setting)
    crawler.crawl(dir_prefix=args.dir_prefix,
                  query=args.query,
                  crawl_type=args.crawl_type,
                  number=args.number,
                  authentication=args.authentication,
                  is_random=args.random)
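# Usage sketch (hedged): assuming this entry point lives in a module named main.py
# (the file name is an assumption, not shown above), the crawler would be driven
# from the command line with the flags defined by the argparse block:
#
#   python main.py -q '#food' -t tags -n 100 -e dev -l -a auth.json -s settings.json
#
# With -r set, the -q value is ignored and a hashtag is chosen at random from the
# HASHTAGS list in settings.json.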
def __init__(self, headless=True, setting_path='settings.json'):
    # Setting
    with open(setting_path) as data_file:
        self.setting = json.load(data_file)

    if headless:
        EnvPrint.log_info("headless mode on")
        self._driver = webdriver.PhantomJS(self.setting['PHANTOMJS_PATH'])
        self._driver.set_window_size(1120, 550)
    else:
        self._driver = webdriver.Firefox()

    self._driver.implicitly_wait(10)
    self.data = defaultdict(list)
def __init__(self, headless=True, setting_path='settings.json'):
    # Setting
    with open(setting_path) as data_file:
        self.setting = json.load(data_file)

    if headless:
        EnvPrint.log_info("headless mode on")
        self._driver = webdriver.PhantomJS(
            "{}/node_modules/phantomjs/bin/phantomjs".format(
                os.path.dirname(os.path.abspath(__file__))))
        self._driver.set_window_size(1120, 550)
    else:
        self._driver = webdriver.Firefox()

    self._driver.implicitly_wait(10)
    self.data = defaultdict(list)
def crawl(self, csv_file_loc, query, crawl_type, number, authentication, is_random):
    EnvPrint.log_info(
        "crawl_type: {}, number: {}, authentication: {}, is_random: {}".format(
            crawl_type, number, authentication, is_random))

    # !! CHANGE FROM DB CONNECTION TO FILE SYSTEM !!
    self.csv_file_loc = csv_file_loc
    self.crawl_type = crawl_type
    self.is_random = is_random

    if self.crawl_type == "tags":
        if is_random:
            self.query = random.choice(self.setting["HASHTAGS"])
        else:
            self.query = query

        self.crawl_type = crawl_type
        self.accountIdx = 0
        self.totalNum = number
        self.refresh_idx = 0

        self.login(authentication)
        self.browse_target_page()

        try:
            self.scrape_tags(number)
        except Exception:
            EnvPrint.log_info("Quitting driver...")
            self.quit()
    else:
        self.accountIdx = 0
        self.totalNum = number
        self.refresh_idx = 0

        self.login(authentication)
        self.browse_target_page()

        try:
            self.scrape_tags(number)
        except Exception:
            EnvPrint.log_info("Quitting driver...")
            self.quit()

        # EnvPrint.log_info("Unknown crawl type: {}".format(crawl_type))
        # self.quit()
        # return

    # Quit driver
    EnvPrint.log_info("Quitting driver...")
    self.quit()
def download_and_save(self, dir_prefix, query, crawl_type):
    # Check if is hashtag
    dir_name = query.lstrip('#') + '.hashtag' if query.startswith('#') else query

    dir_path = os.path.join(dir_prefix, dir_name)

    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    EnvPrint.log_info("Saving to directory: {}".format(dir_path))

    # Save Photos
    for idx, photo_link in enumerate(self.data['photo_links'], 0):
        sys.stdout.write("\033[F")
        EnvPrint.log_info("Downloading {} images to ".format(idx + 1))

        # Filename
        _, ext = os.path.splitext(photo_link)
        filename = str(idx) + ext
        filepath = os.path.join(dir_path, filename)

        # Send image request
        urlretrieve(photo_link, filepath)

    # Save Captions
    for idx, caption in enumerate(self.data['captions'], 0):
        filename = str(idx) + '.txt'
        filepath = os.path.join(dir_path, filename)

        with codecs.open(filepath, 'w', encoding='utf-8') as fout:
            fout.write(caption + '\n')

    # Save followers/following
    filename = crawl_type + '.txt'
    filepath = os.path.join(dir_path, filename)

    if len(self.data[crawl_type]):
        with codecs.open(filepath, 'w', encoding='utf-8') as fout:
            for fol in self.data[crawl_type]:
                fout.write(fol + '\n')
def __init__(self, headless=True, setting_path='settings.json'):
    # Setting
    with open(setting_path) as data_file:
        self.setting = json.load(data_file)

    if headless:
        EnvPrint.log_info("headless mode on")
        self._driver = webdriver.PhantomJS(self.setting['PHANTOMJS_PATH'])
        self._driver.set_window_size(1120, 550)
    else:
        self._driver = webdriver.Firefox()

    self._driver.implicitly_wait(10)
    self.data = defaultdict(list)

    # DB connection
    connection = pymongo.MongoClient(self.setting['DB_HOST'], self.setting['DB_PORT'])
    db_name = self.setting['DB_NAME']
    self.db = connection[db_name]
    collectionName = "fb-explore-{}-Collection".format(now.strftime("%Y-%m-%d"))
    self.collection = self.db[collectionName]
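# A settings.json sketch covering only the keys referenced in this section
# (PHANTOMJS_PATH, FACEBOOK_DOMAIN, INSTA_DOMAIN, HASHTAGS, DB_HOST, DB_PORT,
# DB_NAME). All values below are illustrative placeholders, not the project's
# actual configuration:
#
# {
#     "PHANTOMJS_PATH": "/usr/local/bin/phantomjs",
#     "FACEBOOK_DOMAIN": "https://www.facebook.com/",
#     "INSTA_DOMAIN": "https://www.instagram.com/",
#     "HASHTAGS": ["food", "travel"],
#     "DB_HOST": "localhost",
#     "DB_PORT": 27017,
#     "DB_NAME": "crawler"
# }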
async def loop_func(last_post_num, load_idx, loop):
    last_post_num_pre = last_post_num
    load_idx = load_idx

    while last_post_num_pre <= number:
        self._driver.execute_script(SCROLL_DOWN)
        time.sleep(0.2)
        # explore_main_list_new = await get_new_posts()

        try:
            WebDriverWait(self._driver, 3).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"
                )))

            explore_main_list_new = self._driver.find_elements_by_xpath(
                "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]")

            if last_post_num_pre >= len(explore_main_list_new):
                continue

            last_post_num_new = await savePost(explore_main_list_new)
            load_idx = load_idx + 1

            cur_post_count = last_post_num_pre + last_post_num_new

            if self.crawl_type == "tags":
                EnvPrint.log_info(
                    "current post count : {}, tags : {} ---------------------------------"
                    .format(cur_post_count, self.query))
            else:
                EnvPrint.log_info(
                    "current post count : {} ---------------------------------"
                    .format(cur_post_count))

            EnvPrint.log_info(
                "post crawling done ------------------------------------------",
                "debug")

            last_post_num_pre = cur_post_count
            # await deletePost(last_post_num_new)

        except Exception:
            self._driver.save_screenshot(
                'img/{}'.format('screenshot_post_error.png'))
            # error_box = self._driver.find_elements_by_xpath("//div[contains(@class, '_fb78b')]")
            # if last_post_num_new == 0:
            #     self.leftover_num = number - last_post_num
            #     raise Exception("error")

    loop.stop()
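# Driving sketch (an assumption; the caller that creates the event loop is not
# shown in this section). loop_func keeps scrolling and awaiting savePost until
# `number` posts have been collected, then calls loop.stop(), so the surrounding
# code would schedule it on an asyncio loop roughly like this:
#
#   loop = asyncio.get_event_loop()
#   asyncio.ensure_future(loop_func(last_post_num=0, load_idx=0, loop=loop))
#   loop.run_forever()
#   loop.close()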
def crawl(self, dir_prefix, query, crawl_type, number, authentication, is_random):
    EnvPrint.log_info("crawl_type: {}, number: {}, authentication: {}, is_random: {}"
                      .format(crawl_type, number, authentication, is_random))

    self.crawl_type = crawl_type
    self.is_random = is_random

    if self.crawl_type == "tags":
        if is_random:
            self.query = random.choice(self.setting["HASHTAGS"])
        else:
            self.query = query

        self.crawl_type = crawl_type
        self.accountIdx = 0
        self.totalNum = number
        self.refresh_idx = 0

        self.login(authentication)
        self.browse_target_page()

        try:
            self.scrape_tags(number)
        except Exception:
            EnvPrint.log_info("Quitting driver...")
            self.quit()
    else:
        self.accountIdx = 0
        self.totalNum = number
        self.refresh_idx = 0

        self.login(authentication)

        try:
            self.scrape_tags(number)
        except Exception:
            EnvPrint.log_info("Quitting driver...")
            self.quit()

    # Quit driver
    EnvPrint.log_info("Quitting driver...")
    self.quit()
def login(self, authentication=None):
    """
    authentication: path to authentication json file
    """
    self._driver.get(urljoin(self.setting['INSTA_DOMAIN'], "accounts/login/"))

    if authentication:
        EnvPrint.log_info(
            "Username and password loaded from {}".format(authentication))

        with open(authentication, 'r') as fin:
            self.auth_dict = json.loads(fin.read())

        # Input username
        try:
            username_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'username')))
            username_input.send_keys(
                self.auth_dict["INSTAGRAM"][self.accountIdx]['username'])
        except Exception:
            self._driver.save_screenshot(
                'img/{}'.format('screenshot_login_01.png'))

        # Input password
        try:
            password_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'password')))
            password_input.send_keys(
                self.auth_dict["INSTAGRAM"][self.accountIdx]['password'])

            # Submit
            password_input.submit()
        except Exception:
            self._driver.save_screenshot(
                'img/{}'.format('screenshot_login_02.png'))
    else:
        EnvPrint.log_info("Type your username and password by hand to login!")
        EnvPrint.log_info("You have a minute to do so!")

    WebDriverWait(self._driver, 60).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE)))
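# An auth.json sketch inferred from the lookups above
# (self.auth_dict["INSTAGRAM"][self.accountIdx]['username'] here, and the
# "FACEBOOK" entries used in logoutAndLogin). All credentials are placeholders:
#
# {
#     "INSTAGRAM": [
#         {"username": "account_one", "password": "secret1"},
#         {"username": "account_two", "password": "secret2"}
#     ],
#     "FACEBOOK": [
#         {"username": "fb_account", "password": "secret3"}
#     ]
# }
#
# accountIdx indexes into these lists, which is presumably what allows a switch
# to another account when a post-view limit is reached.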
def scrape_tags(self, number):
    """
    scrape_tags method : scraping Instagram image URL & tags
    """
    post_num = 0

    while post_num < number:
        try:
            self._driver.execute_script(SCROLL_DOWN)
            time.sleep(0.2)
            self._driver.execute_script(SCROLL_UP)
            time.sleep(0.2)

            main_post = self._driver.find_elements_by_xpath("//div[contains(@class, '_4ikz')]")
            org_post = main_post[0]
            post = main_post[0]
            # post = main_post[post_num]

            # Descend into nested user stories until the innermost post is reached
            while len(post.find_elements_by_xpath(
                    ".//div[contains(@class, '_5pcr') and contains(@class,'fbUserStory')]")):
                post = post.find_elements_by_xpath(
                    ".//div[contains(@class, '_5pcr') and contains(@class,'fbUserStory')]")[0]

            see_more_link = post.find_elements_by_xpath(".//a[contains(@class, 'see_more_link')]")

            id = ""
            post_type = ""
            post_id = ""

            if see_more_link:
                link_data = see_more_link[0].get_attribute("href")

                if link_data != "#":
                    link_data = link_data.split('?')[0]
                    link_data = link_data.replace("https://www.facebook.com/", "")
                    link_data = link_data.split('/')

                    id = link_data[0]
                    post_type = link_data[1]
                    post_id = link_data[2]

            write_utime_ele = post.find_elements_by_xpath(
                ".//abbr[contains(@class, '_5ptz') and contains(@class, 'timestamp')]")

            write_date = ""
            write_utime = ""

            if write_utime_ele:
                write_utime = write_utime_ele[0].get_attribute("data-utime")
                write_utime = int(write_utime)
                write_date = datetime.utcfromtimestamp(write_utime).isoformat()

                time_atag_href = write_utime_ele[0].find_elements_by_xpath("..")[0].get_attribute("href")

                link_data = time_atag_href.replace("https://www.facebook.com/", "")
                # link_data = time_atag_href[1:].split('/')
                link_data = link_data.split('/')

                if link_data[0] == "groups":
                    id = link_data[1]
                    post_type = link_data[0]
                    post_id = link_data[2] + '/' + link_data[3]
                else:
                    id = link_data[0]
                    post_type = link_data[1]
                    post_id = link_data[2]

            text = post.find_elements_by_xpath(
                ".//div[contains(@class, '_5pbx') and contains(@class, 'userContent')]")

            if text:
                text = text[0].get_attribute("innerHTML")
                cleanr = re.compile('<.*?>')
                text = re.sub(cleanr, '', text)
            else:
                text = ""

            img_src_arr = post.find_elements_by_xpath(
                ".//div[contains(@class, '_1dwg') and contains(@class, '_1w_m')]"
                "//div[contains(@class, '_3x-2')]//img[@src]")
            img_src = ""

            if img_src_arr:
                img_src = img_src_arr[0].get_attribute("src")

            # Only insert posts not already stored for this id/post/timestamp
            if self.collection.find({
                    "id": id,
                    "post_type": post_type,
                    "post_id": post_id,
                    "write_utime": write_utime
            }).count() == 0:
                reg_date = datetime.now()

                self.collection.insert({"id": id,
                                        "post_type": post_type,
                                        "post_id": post_id,
                                        "img": img_src,
                                        "text": text,
                                        "reg_date": reg_date,
                                        "write_utime": write_utime,
                                        "write_date": write_date})

                text_enc = text.encode('utf-8')

                EnvPrint.log_info("current post count : {} ---------------------------------".format(post_num))
                EnvPrint.log_info({"id": id,
                                   "post_type": post_type,
                                   "post_id": post_id,
                                   "img": img_src,
                                   "text": text_enc,
                                   "reg_date": reg_date,
                                   "write_utime": write_utime,
                                   "write_date": write_date})

                post_num = post_num + 1

            self.deletePost(org_post)

        except Exception:
            self._driver.save_screenshot('img/{}'.format('screenshot_post_error.png'))
def scrape_tags_aco(self, number):
    """
    scrape_tags method : scraping Instagram image URL & tags
    """
    last_post_num_pre = 1

    # Matches Hangul characters; used to keep only Korean-language posts
    regexKo = re.compile(
        u"\s*([\u1100-\u11FF]|[\u3130-\u318F]|[\uA960-\uA97F]|[\uAC00-\uD7AF]|[\uD7B0-\uD7FF])\s*",
        re.UNICODE)

    while last_post_num_pre <= number:
        self._driver.execute_script(SCROLL_DOWN)
        time.sleep(0.2)
        self._driver.execute_script(SCROLL_UP)

        EnvPrint.log_info(
            "user count : {} ---------------------------------------".format(last_post_num_pre))

        WebDriverWait(self._driver, 3).until(
            EC.presence_of_element_located((
                By.XPATH,
                "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"
            )))

        explore_main_list_new = self._driver.find_elements_by_xpath(
            "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]")

        post_random = explore_main_list_new[0].find_elements_by_xpath(".//a")[0]

        self._driver.get(post_random.get_attribute("href"))
        time.sleep(0.2)

        exp_single_post = WebDriverWait(self._driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//article[contains(@class, '_7hhq6')]")))

        exp_article_src = BeautifulSoup(
            exp_single_post.get_attribute("innerHTML"), "html.parser")

        data_box = exp_article_src.find('div', class_='_ebcx9')

        ul = data_box.find('ul', class_='_b0tqa')
        li = ul.find_all('li')[0]

        cleanr = re.compile('<.*?>')
        text = re.sub(cleanr, '', str(li.span))

        isKorean = False

        for ch in text:
            if regexKo.match(ch):
                isKorean = True
                break

        if not isKorean:
            last_post_num_pre = last_post_num_pre + 1
            self._driver.back()
            pass

        # self.deletePost_aco(explore_main_list_new[0])

        id_a = WebDriverWait(self._driver, 3).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "article._622au a._2g7d5")))

        user_url = id_a.get_attribute("href")
        user_id = id_a.get_attribute("title")

        self._driver.get(user_url)

        today_post = True
        today_post_cnt = 0
        today = None

        # while today_post:
        #     time.sleep(0.2)
        #
        #     WebDriverWait(self._driver, 3).until(
        #         EC.presence_of_element_located((By.XPATH,
        #             "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"))
        #     )
        #
        #     user_post_list_new = self._driver.find_elements_by_xpath(
        #         "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]")
        #
        #     if not len(user_post_list_new) <= today_post_cnt:
        #         today_post = False
        #         break
        #
        #     user_cur_post = user_post_list_new[today_post_cnt].find_elements_by_xpath(".//a")
        #
        #     if not user_cur_post:
        #         today_post = False
        #         pass
        #
        #     user_cur_post = user_cur_post[0]
        #
        #     post_url = user_cur_post.get_attribute("href")
        #
        #     post_url_arr = post_url.split('/')
        #     post_id = post_url_arr[len(post_url_arr) - 2]
        #
        #     self._driver.get(post_url)
        #
        #     time.sleep(0.2)
        #
        #     single_post = WebDriverWait(self._driver, 10).until(
        #         EC.presence_of_element_located((By.XPATH, "//article[contains(@class, '_7hhq6')]"))
        #     )
        #
        #     article_src = BeautifulSoup(single_post.get_attribute("innerHTML"), "html.parser")
        #
        #     data_box = article_src.find('div', class_='_ebcx9')
        #     media_box = article_src.find('div', class_='_sxolz')
        #
        #     write_date = data_box.find('time', class_='_p29ma').get('datetime')
        #     write_date_ymd = write_date.split('T')[0]
        #     if today_post_cnt == 0:
        #         today = write_date_ymd
        #
        #     if today_post_cnt == 0:
        #         today_post = True
        #     else:
        #         # date differs
        #         if today != write_date_ymd:
        #             today_post = False
        #             pass
        #
        #     EnvPrint.log_info("user's post count : {} ---------------------------------".format(today_post_cnt))
        #
        #     ul = data_box.find('ul', class_='_b0tqa')
        #     li = ul.find_all('li')[0]
        #
        #     cleanr = re.compile('<.*?>')
        #     text = re.sub(cleanr, '', str(li.span))
        #
        #     isKorean = False
        #
        #     for ch in text:
        #         if regexKo.match(ch):
        #             isKorean = True
        #             break
        #
        #     if isKorean:
        #         EnvPrint.log_info(text)
        #
        #         media_src = media_box.find_all(['video', 'img'])[0].get('src')
        #         EnvPrint.log_info(media_src)
        #
        #         reg_date = datetime.datetime.now()
        #
        #         if text and today_post:
        #             with open(self.csv_file_loc) as f:
        #                 csvreader = csv.reader(f)
        #                 exist_ids = [row[0] for row in csvreader]
        #
        #             if post_id in exist_ids:
        #                 pass
        #             else:
        #                 with open(self.csv_file_loc, 'a') as file:
        #                     # post_id,user_id,img,text,write_date,reg_date
        #                     csvwriter = csv.writer(file)
        #                     csvwriter.writerow([post_id, user_id, media_src, text, write_date, reg_date])
        #
        #                 text_enc = text.encode('utf-8')
        #
        #                 EnvPrint.log_info({"post_id": post_id
        #                                    , "user_id": user_id
        #                                    , "img": media_src
        #                                    , "text": text_enc
        #                                    , "write_date": write_date
        #                                    , "reg_date": reg_date})
        #
        #     today_post_cnt = today_post_cnt + 1
        #     self._driver.back()

        last_post_num_pre = last_post_num_pre + 1
        self._driver.back()
        self._driver.back()