Example #1
	def scrape_photo_links(self, number, is_hashtag=False):
		EnvPrint.log_info("Scraping photo links...")
		# Capture http(s) image URLs ending in .jpg from the rendered page source
		encased_photo_links = re.finditer(r'src="(https?://[/\w .-]+\.jpg)"',
								self._driver.page_source)
		
		photo_links = [m.group(1) for m in encased_photo_links]
		EnvPrint.log_info(photo_links, "pprint")
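
A minimal sketch of what the cleaned-up pattern above extracts; the sample HTML string is hypothetical:

	import re

	sample = '<img src="https://scontent.cdninstagram.com/t51/photo1.jpg">'
	links = [m.group(1) for m in re.finditer(r'src="(https?://[/\w .-]+\.jpg)"', sample)]
	print(links)  # ['https://scontent.cdninstagram.com/t51/photo1.jpg']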
Example #2
	def logoutAndLogin(self):
		self._driver.get(urljoin(self.setting['FACEBOOK_DOMAIN'], "accounts/logout"))

		self._driver.get(urljoin(self.setting['FACEBOOK_DOMAIN'], "accounts/login/"))

		EnvPrint.log_info("Since Instagram provides 5000 post views per Hour, relogin with annother username and password loaded from {}".format(authentication))
		
		# Input username
		try:
			username_input = WebDriverWait(self._driver, 5).until(
				EC.presence_of_element_located((By.NAME, 'email'))
			)
			username_input.send_keys(self.auth_dict["FACEBOOK"][self.accountIdx]['username'])

		except Exception:
			self._driver.save_screenshot('img/{}'.format('screenshot_relogin_01.png'))

		# Input password
		try:
			password_input = WebDriverWait(self._driver, 5).until(
				EC.presence_of_element_located((By.NAME, 'pass'))
			)
			password_input.send_keys(self.auth_dict["FACEBOOK"][self.accountIdx]['password'])
			# Submit
			password_input.submit()
			
		except Exception:
			self._driver.save_screenshot('img/{}'.format('screenshot_relogin_02.png'))
		
		WebDriverWait(self._driver, 60).until(
			EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE))
		)
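
The account lookups above, together with the login() snippet in Example #12, imply an auth.json shaped roughly like the dict below; the key names come from the code, every value is a placeholder:

	auth_dict = {
	    "FACEBOOK": [
	        {"username": "user1@example.com", "password": "secret1"},
	        {"username": "user2@example.com", "password": "secret2"}
	    ],
	    "INSTAGRAM": [
	        {"username": "insta_user1", "password": "secret3"}
	    ]
	}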
Example #3

        async def savePost(new_list):
            ignore_num = 0

            for i in range(0, len(new_list)):

                # Post id is the second-to-last segment of the permalink URL
                ahref_arr = new_list[i].find_elements_by_xpath(
                    ".//a")[0].get_attribute("href").split('/')
                id = ahref_arr[-2]

                # The image element carries both the source URL and the alt text
                img_el = new_list[i].find_elements_by_xpath(
                    ".//img[@class='_2di5p']")[0]
                img_src = img_el.get_attribute("src")
                text = img_el.get_attribute("alt")
                reg_date = datetime.datetime.now()
                write_date = None

                try:
                    response = requests.head(img_src, timeout=1)
                    # Header may be absent, so fall back to an empty string
                    write_date = response.headers.get("last-modified", "")

                except requests.exceptions.RequestException:
                    write_date = ""
                finally:
                    if text:
                        exist_ids = None
                        with open(self.csv_file_loc) as f:
                            csvreader = csv.reader(f)
                            exist_ids = [row[0] for row in csvreader]

                        if id in exist_ids:
                            ignore_num = ignore_num + 1
                        else:

                            with open(self.csv_file_loc, 'a') as file:
                                # file.write("{},{},{},{},{},{}\n".format(id, img_src, text, self.query, write_date, reg_date))

                                csvwriter = csv.writer(file)
                                csvwriter.writerow([
                                    id, img_src, text, self.query, write_date,
                                    reg_date
                                ])

                            text_enc = text.encode('utf-8')

                            EnvPrint.log_info(
                                {
                                    "id": id,
                                    "img": img_src,
                                    "text": text_enc,
                                    "has_tag": self.query,
                                    "write_date": write_date,
                                    "reg_date": reg_date
                                }, "debug")

            last_post_num_new = len(new_list) - ignore_num

            return last_post_num_new
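
The writerow call above fixes the CSV column order; a minimal sketch of reading the file back, assuming a placeholder path:

	import csv

	with open('./data/posts.csv') as f:  # placeholder path
	    for row in csv.reader(f):
	        post_id, img_src, text, query, write_date, reg_date = row
	        print(post_id, img_src)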
Example #4
def main():
	#   Arguments  #
	parser = argparse.ArgumentParser(description='Pengtai Instagram Crawler')
	parser.add_argument('-d', '--dir_prefix', type=str,
		default='./data/', help='directory to save results')
	parser.add_argument('-q', '--query', type=str, 
		help="target to crawl, add '#' for hashtags")
	parser.add_argument('-t', '--crawl_type', type=str,
		default='all', help="Options: 'all' | 'tags' | 'photos' | 'following'")
	parser.add_argument('-n', '--number', type=int, default=0,
		help='Number of posts to download: integer')
	parser.add_argument('-l', '--headless', action='store_true',
		help='If set, use the PhantomJS driver to run the script headless')
	parser.add_argument('-a', '--authentication', type=str, default='auth.json',
		help='path to authentication json file')
	parser.add_argument('-s', '--setting', type=str, default='settings.json',
		help='path to setting json file')
	parser.add_argument('-e', '--env', type=str, default='pro',
		help="environment options: 'pro' | 'dev' | 'test'")
	parser.add_argument('-r', '--random', action='store_true',
		help='enables tags mode with a random hashtag from settings.json')

	args = parser.parse_args()
	#  End Argparse #

	# `now` is presumably a module-level datetime set at startup
	nowDate = now.strftime("%Y%m%d")
	filename = './logs/log-'+args.env+'.'+nowDate+'.log'
	FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

	if args.env == "pro":
		logging.basicConfig(filename=filename, level=logging.INFO, format=FORMAT)

	elif args.env == "dev":
		logging.basicConfig(filename=filename,level=logging.DEBUG)
		root = logging.getLogger()
		ch = logging.StreamHandler(sys.stdout)
		ch.setLevel(logging.DEBUG)
		formatter = logging.Formatter(FORMAT)
		ch.setFormatter(formatter)
		root.addHandler(ch)

	EnvPrint.env = args.env

	EnvPrint.log_info("=========================================")
	EnvPrint.log_info(args)

	crawler = FacebookCrawler(headless=args.headless, setting_path=args.setting)
	crawler.crawl(dir_prefix=args.dir_prefix,
		query=args.query,
		crawl_type=args.crawl_type,
		number=args.number,
		authentication=args.authentication,
		is_random=args.random)
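
A hedged invocation sketch, assuming this entry point lives in a file named crawler.py (the filename is a guess; the flags come from the argparse definitions above):

	python crawler.py -q '#seoul' -t tags -n 100 -l -a auth.json -s settings.json -e dev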
Example #5

    def __init__(self, headless=True, setting_path='settings.json'):
        # Setting
        with open(setting_path) as data_file:
            self.setting = json.load(data_file)

        if headless:
            EnvPrint.log_info("headless mode on")
            self._driver = webdriver.PhantomJS(self.setting['PHANTOMJS_PATH'])
            self._driver.set_window_size(1120, 550)
        else:
            self._driver = webdriver.Firefox()

        self._driver.implicitly_wait(10)
        self.data = defaultdict(list)
Example #6

    def __init__(self, headless=True, setting_path='settings.json'):
        # Setting
        with open(setting_path) as data_file:
            self.setting = json.load(data_file)

        if headless:
            EnvPrint.log_info("headless mode on")
            self._driver = webdriver.PhantomJS(
                "{}/node_modules/phantomjs/bin/phantomjs".format(
                    os.path.dirname(os.path.abspath(__file__))))
            self._driver.set_window_size(1120, 550)
        else:
            self._driver = webdriver.Firefox()

        self._driver.implicitly_wait(10)
        self.data = defaultdict(list)
Example #7

    def crawl(self, csv_file_loc, query, crawl_type, number, authentication,
              is_random):
        EnvPrint.log_info(
            "crawl_type: {}, number: {}, authentication: {}, is_random: {}".
            format(crawl_type, number, authentication, is_random))

        # !! CHANGE FROM DB CONNECTION TO FILE SYSTEM !!

        self.csv_file_loc = csv_file_loc

        self.crawl_type = crawl_type
        self.is_random = is_random

        if self.crawl_type == "tags":

            if is_random:
                self.query = random.choice(self.setting["HASHTAGS"])
            else:
                self.query = query

            self.crawl_type = crawl_type
            self.accountIdx = 0
            self.totalNum = number
            self.refresh_idx = 0
            self.login(authentication)
            self.browse_target_page()

            try:
                self.scrape_tags(number)
            except Exception:
                EnvPrint.log_info("Quitting driver...")
                self.quit()
        else:
            self.accountIdx = 0
            self.totalNum = number
            self.refresh_idx = 0
            self.login(authentication)
            self.browse_target_page()
            try:
                self.scrape_tags(number)
            except Exception:
                EnvPrint.log_info("Quitting driver...")
                self.quit()
        # 	EnvPrint.log_info("Unknown crawl type: {}".format(crawl_type))
        # 	self.quit()
        # 	return

        # Quit driver
        EnvPrint.log_info("Quitting driver...")
        self.quit()
Example #8
	def download_and_save(self, dir_prefix, query, crawl_type):
		# Check if is hashtag
		dir_name = query.lstrip(
			'#') + '.hashtag' if query.startswith('#') else query

		dir_path = os.path.join(dir_prefix, dir_name)
		if not os.path.exists(dir_path):
			os.makedirs(dir_path)

		EnvPrint.log_info("Saving to directory: {}".format(dir_path))

		# Save Photos
		for idx, photo_link in enumerate(self.data['photo_links'], 0):
			sys.stdout.write("\033[F")
			EnvPrint.log_info("Downloading {} images to ".format(idx + 1))
			# Filename
			_, ext = os.path.splitext(photo_link)
			filename = str(idx) + ext
			filepath = os.path.join(dir_path, filename)
			# Send image request
			urlretrieve(photo_link, filepath)

		# Save Captions
		for idx, caption in enumerate(self.data['captions'], 0):

			filename = str(idx) + '.txt'
			filepath = os.path.join(dir_path, filename)

			with codecs.open(filepath, 'w', encoding='utf-8') as fout:
				fout.write(caption + '\n')

		# Save followers/following
		filename = crawl_type + '.txt'
		filepath = os.path.join(dir_path, filename)
		if len(self.data[crawl_type]):
			with codecs.open(filepath, 'w', encoding='utf-8') as fout:
				for fol in self.data[crawl_type]:
					fout.write(fol + '\n')
Example #9
	def __init__(self, headless=True, setting_path='settings.json'):
		# Setting 
		with open(setting_path) as data_file:
			self.setting = json.load(data_file)

		if headless:
			EnvPrint.log_info("headless mode on")
			self._driver = webdriver.PhantomJS(self.setting['PHANTOMJS_PATH'])
			self._driver.set_window_size(1120, 550)
		else:
			self._driver = webdriver.Firefox()

		self._driver.implicitly_wait(10)
		self.data = defaultdict(list)
		
		# DB connection
		connection = pymongo.MongoClient(self.setting['DB_HOST'], self.setting['DB_PORT'])

		db_name = self.setting['DB_NAME']
		self.db = connection[db_name]
		
		collectionName = "fb-explore-{}-Collection".format(now.strftime("%Y-%m-%d"))
		self.collection = self.db[collectionName]
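
The settings keys referenced across these snippets suggest a settings.json along these lines; only the key names are taken from the code, every value is a placeholder:

	setting = {
	    "PHANTOMJS_PATH": "/usr/local/bin/phantomjs",
	    "INSTA_DOMAIN": "https://www.instagram.com",
	    "FACEBOOK_DOMAIN": "https://www.facebook.com",
	    "HASHTAGS": ["food", "travel"],
	    "DB_HOST": "localhost",
	    "DB_PORT": 27017,
	    "DB_NAME": "fb-crawler"
	}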
Example #10
        async def loop_func(last_post_num, load_idx, loop):
            last_post_num_pre = last_post_num

            while last_post_num_pre <= number:
                self._driver.execute_script(SCROLL_DOWN)
                time.sleep(0.2)

                # explore_main_list_new = await get_new_posts()

                try:
                    WebDriverWait(self._driver, 3).until(
                        EC.presence_of_element_located((
                            By.XPATH,
                            "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"
                        )))

                    explore_main_list_new = self._driver.find_elements_by_xpath(
                        "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"
                    )

                    if last_post_num_pre >= len(explore_main_list_new):
                        continue

                    last_post_num_new = await savePost(explore_main_list_new)

                    load_idx = load_idx + 1
                    cur_post_count = last_post_num_pre + last_post_num_new

                    if self.crawl_type == "tags":
                        EnvPrint.log_info(
                            "current post count : {}, tags : {} ---------------------------------"
                            .format(cur_post_count, self.query))
                    else:
                        EnvPrint.log_info(
                            "current post count : {} ---------------------------------"
                            .format(cur_post_count))

                    EnvPrint.log_info(
                        "post crawling done ------------------------------------------",
                        "debug")

                    last_post_num_pre = cur_post_count

                    # await deletePost(last_post_num_new)
                except Exception:
                    self._driver.save_screenshot(
                        'img/{}'.format('screenshot_post_error.png'))

                    # error_box = self._driver.find_elements_by_xpath("//div[contains(@class, '_fb78b')]")
                    # if last_post_num_new == 0:
                    # 	self.leftover_num = number - last_post_num
                    # 	raise Exception("error")

            loop.stop()
Example #11
	def crawl(self, dir_prefix, query, crawl_type, number, authentication, is_random):
		EnvPrint.log_info("crawl_type: {}, number: {}, authentication: {}, is_random: {}"
			.format(crawl_type, number, authentication, is_random))
		
		self.crawl_type = crawl_type
		self.is_random = is_random

		if self.crawl_type == "tags":

			if is_random:
				self.query = random.choice(self.setting["HASHTAGS"])
			else:
				self.query = query

			self.crawl_type = crawl_type
			self.accountIdx = 0
			self.totalNum = number
			self.refresh_idx = 0
			self.login(authentication)
			self.browse_target_page()
			try:
				self.scrape_tags(number)
			except Exception:
				EnvPrint.log_info("Quitting driver...")
				self.quit()
		else:
			self.accountIdx = 0
			self.totalNum = number
			self.refresh_idx = 0
			self.login(authentication)

			try:
				self.scrape_tags(number)
			except Exception:
				EnvPrint.log_info("Quitting driver...")
				self.quit()
			
		# Quit driver
		EnvPrint.log_info("Quitting driver...")
		self.quit()
Example #12
    def login(self, authentication=None):
        """
        authentication: path to authentication json file
        """
        self._driver.get(
            urljoin(self.setting['INSTA_DOMAIN'], "accounts/login/"))

        if authentication:
            EnvPrint.log_info(
                "Username and password loaded from {}".format(authentication))
            # print("Username and password loaded from {}".format(authentication))
            with open(authentication, 'r') as fin:
                self.auth_dict = json.loads(fin.read())

            # Input username
            try:
                username_input = WebDriverWait(self._driver, 5).until(
                    EC.presence_of_element_located((By.NAME, 'username')))
                username_input.send_keys(
                    self.auth_dict["INSTAGRAM"][self.accountIdx]['username'])
            except Exception:
                self._driver.save_screenshot(
                    'img/{}'.format('screenshot_login_01.png'))

            # Input password
            try:
                password_input = WebDriverWait(self._driver, 5).until(
                    EC.presence_of_element_located((By.NAME, 'password')))
                password_input.send_keys(
                    self.auth_dict["INSTAGRAM"][self.accountIdx]['password'])

                # Submit
                password_input.submit()
            except Exception:
                self._driver.save_screenshot(
                    'img/{}'.format('screenshot_login_02.png'))

        else:
            EnvPrint.log_info(
                "Type your username and password by hand to login!")
            EnvPrint.log_info("You have a minute to do so!")

        WebDriverWait(self._driver, 60).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE)))
Example #13
	def scrape_tags(self, number):
		"""
			scrape_tags method : scraping Facebook post data (image URL, text, timestamps)
		"""
		post_num = 0

		while post_num < number:

			try:
				self._driver.execute_script(SCROLL_DOWN)
				time.sleep(0.2)
				self._driver.execute_script(SCROLL_UP)
				time.sleep(0.2)

				main_post = self._driver.find_elements_by_xpath("//div[contains(@class, '_4ikz')]")
				org_post = main_post[0]
				post = main_post[0]
				# post = main_post[post_num]
				
				while len(post.find_elements_by_xpath(".//div[contains(@class, '_5pcr') and contains(@class,'fbUserStory')]")):
					post = post.find_elements_by_xpath(".//div[contains(@class, '_5pcr') and contains(@class,'fbUserStory')]")[0]

				see_more_link = post.find_elements_by_xpath(".//a[contains(@class, 'see_more_link')]")

				id = ""
				post_type = ""
				post_id = ""

				if see_more_link:
					link_data = see_more_link[0].get_attribute("href")
					if link_data != "#":
						link_data = link_data.split('?')[0]
						link_data = link_data.replace("https://www.facebook.com/","")
						link_data = link_data.split('/')
						id = link_data[0]
						post_type = link_data[1]
						post_id = link_data[2]

				write_utime_ele = post.find_elements_by_xpath(".//abbr[contains(@class, '_5ptz') and contains(@class, 'timestamp')]")
				write_date = ""
				write_utime = ""

				if write_utime_ele:
					write_utime = write_utime_ele[0].get_attribute("data-utime")
					write_utime = int(write_utime)
					write_date = datetime.utcfromtimestamp(write_utime).isoformat()
					time_atag_href = write_utime_ele[0].find_elements_by_xpath("..")[0].get_attribute("href")
					link_data = time_atag_href.replace("https://www.facebook.com/","")
					# link_data = time_atag_href[1:].split('/')
					link_data = link_data.split('/')
					if link_data[0] == "groups":
						id = link_data[1]
						post_type = link_data[0]
						post_id = link_data[2]+'/'+link_data[3]
					else:
						id = link_data[0]
						post_type = link_data[1]
						post_id = link_data[2]
					
				
				text = post.find_elements_by_xpath(".//div[contains(@class, '_5pbx') and contains(@class, 'userContent')]")
				if text:
					text = text[0].get_attribute("innerHTML")
					cleanr = re.compile('<.*?>')
					text = re.sub(cleanr, '', text)
				else:
					text = ""

				img_src_arr = post.find_elements_by_xpath(".//div[contains(@class, '_1dwg') and contains(@class, '_1w_m')]//div[contains(@class, '_3x-2')]//img[@src]")
				img_src = ""

				if img_src_arr:
					img_src = img_src_arr[0].get_attribute("src")

				if self.collection.find({
					"id":id, 
					"post_type":post_type,
					"post_id":post_id,
					"write_utime":write_utime
				}).count() == 0:

					reg_date = datetime.now()
					
					self.collection.insert({"id":id
						,"post_type":post_type
						,"post_id":post_id
						,"img":img_src
						,"text":text
						,"reg_date":reg_date
						,"write_utime":write_utime
					,"write_date":write_date})

					text_enc = text.encode('utf-8')

					EnvPrint.log_info("current post count : {} ---------------------------------".format(post_num))

					EnvPrint.log_info({"id":id
						,"post_type":post_type
						,"post_id":post_id
						,"img":img_src
						,"text":text_enc
						,"reg_date":reg_date
						,"write_utime":write_utime
					,"write_date":write_date})

					post_num = post_num + 1
				
				self.deletePost(org_post)

			except Exception:
				self._driver.save_screenshot('img/{}'.format('screenshot_post_error.png'))
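
A quick sanity check of the data-utime conversion used above; the epoch value is arbitrary:

	from datetime import datetime

	write_utime = 1514764800  # arbitrary epoch seconds
	print(datetime.utcfromtimestamp(write_utime).isoformat())  # 2018-01-01T00:00:00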
Example #14

        def scrape_tags_aco(self, number):
            """
                scrape_tags_aco method : scraping Instagram image URLs & tags
            """

            last_post_num_pre = 1
            regexKo = re.compile(
                u"\s*([\u1100-\u11FF]|[\u3130-\u318F]|[\uA960-\uA97F]|[\uAC00-\uD7AF]|[\uD7B0-\uD7FF])\s*",
                re.UNICODE)

            while last_post_num_pre <= number:
                self._driver.execute_script(SCROLL_DOWN)
                time.sleep(0.2)
                self._driver.execute_script(SCROLL_UP)

                EnvPrint.log_info(
                    "user count : {} ---------------------------------------".
                    format(last_post_num_pre))

                WebDriverWait(self._driver, 3).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"
                    )))

                explore_main_list_new = self._driver.find_elements_by_xpath(
                    "//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"
                )

                post_random = explore_main_list_new[0].find_elements_by_xpath(
                    ".//a")[0]

                self._driver.get(post_random.get_attribute("href"))
                time.sleep(0.2)

                exp_single_post = WebDriverWait(self._driver, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//article[contains(@class, '_7hhq6')]")))

                exp_article_src = BeautifulSoup(
                    exp_single_post.get_attribute("innerHTML"), "html.parser")

                data_box = exp_article_src.find('div', class_='_ebcx9')
                ul = data_box.find('ul', class_='_b0tqa')
                li = ul.find_all('li')[0]

                cleanr = re.compile('<.*?>')
                text = re.sub(cleanr, '', str(li.span))

                isKorean = False

                for ch in text:
                    if regexKo.match(ch):
                        isKorean = True
                        break

                if not isKorean:
                    last_post_num_pre = last_post_num_pre + 1
                    self._driver.back()
                    continue

                # self.deletePost_aco(explore_main_list_new[0])

                id_a = WebDriverWait(self._driver, 3).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "article._622au a._2g7d5")))

                user_url = id_a.get_attribute("href")
                user_id = id_a.get_attribute("title")

                self._driver.get(user_url)

                today_post = True
                today_post_cnt = 0
                today = None

                # while today_post:
                # 	time.sleep(0.2)
                #
                # 	WebDriverWait(self._driver, 3).until(
                # 		EC.presence_of_element_located((By.XPATH,
                # 			"//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]"))
                # 	)
                #
                # 	user_post_list_new = self._driver.find_elements_by_xpath(
                # 		"//div[contains(@class, '_mck9w') and contains(@class,'_gvoze') and contains(@class,'_f2mse')]")
                #
                # 	if not len(user_post_list_new) <= today_post_cnt:
                # 		today_post = False
                # 		break
                #
                # 	user_cur_post = user_post_list_new[today_post_cnt].find_elements_by_xpath(".//a")
                #
                # 	if not user_cur_post:
                # 		today_post = False
                # 		pass
                #
                # 	user_cur_post = user_cur_post[0]
                #
                # 	post_url = user_cur_post.get_attribute("href")
                #
                # 	post_url_arr = post_url.split('/')
                # 	post_id = post_url_arr[len(post_url_arr) - 2]
                #
                # 	self._driver.get(post_url)
                #
                # 	time.sleep(0.2)
                #
                # 	single_post = WebDriverWait(self._driver, 10).until(
                # 		EC.presence_of_element_located((By.XPATH, "//article[contains(@class, '_7hhq6')]"))
                # 	)
                #
                # 	article_src = BeautifulSoup(single_post.get_attribute("innerHTML"), "html.parser")
                #
                # 	data_box = article_src.find('div', class_='_ebcx9')
                # 	media_box = article_src.find('div', class_='_sxolz')
                #
                # 	write_date = data_box.find('time', class_='_p29ma').get('datetime')
                # 	write_date_ymd = write_date.split('T')[0]
                # 	if today_post_cnt == 0:
                # 		today = write_date_ymd
                #
                # 	if today_post_cnt == 0 :
                # 		today_post = True
                # 	else :
                # 		#date differ
                # 		if today != write_date_ymd:
                # 			today_post = False
                # 			pass
                #
                # 	EnvPrint.log_info("user's post count : {} ---------------------------------".format(today_post_cnt))
                #
                # 	ul = data_box.find('ul', class_='_b0tqa')
                # 	li = ul.find_all('li')[0]
                #
                # 	cleanr = re.compile('<.*?>')
                # 	text = re.sub(cleanr, '', str(li.span))
                #
                # 	isKorean = False
                #
                # 	for ch in text:
                # 		if regexKo.match(ch):
                # 			isKorean = True
                # 			break
                #
                # 	if isKorean:
                #
                # 		EnvPrint.log_info(text)
                #
                # 		media_src = media_box.find_all(['video', 'img'])[0].get('src')
                # 		EnvPrint.log_info(media_src)
                #
                # 		reg_date = datetime.datetime.now()
                #
                # 		if text and today_post:
                # 			with open(self.csv_file_loc) as f:
                # 				csvreader = csv.reader(f)
                # 				exist_ids = [row[0] for row in csvreader]
                #
                # 			if post_id in exist_ids:
                # 				pass
                # 			else:
                # 				with open(self.csv_file_loc, 'a') as file:
                # 					# post_id,user_id,img,text,write_date,reg_date
                #
                # 					csvwriter = csv.writer(file)
                # 					csvwriter.writerow([post_id, user_id, media_src, text, write_date, reg_date])
                #
                # 				text_enc = text.encode('utf-8')
                #
                # 				EnvPrint.log_info({"post_id": post_id
                # 								  , "user_id": user_id
                # 								  , "img": media_src
                # 								  , "text": text_enc
                # 								  , "write_date": write_date
                # 								  , "reg_date": reg_date})
                #
                # 	today_post_cnt = today_post_cnt + 1
                # 	self._driver.back()

                last_post_num_pre = last_post_num_pre + 1
                self._driver.back()
                self._driver.back()
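
A minimal check of the Hangul-range pattern used above, against two hypothetical strings:

	import re

	regexKo = re.compile(
	    u"\s*([\u1100-\u11FF]|[\u3130-\u318F]|[\uA960-\uA97F]|[\uAC00-\uD7AF]|[\uD7B0-\uD7FF])\s*",
	    re.UNICODE)

	print(any(regexKo.match(ch) for ch in "hello"))      # False
	print(any(regexKo.match(ch) for ch in u"안녕하세요"))  # True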