# Imports used throughout this section (normally at the top of the module).
# Project helpers such as GetTheConfig, DeduplicateValue, RequestSearch and
# the DB wrappers are defined elsewhere in the repo, as are the globals
# user_agent_list and access_token.
import re
import json
import facebook                        # facebook-sdk
from random import choice
from requests import get
from bs4 import BeautifulSoup
from py_bing_search import PyBingWebSearch
from twitter import Twitter, OAuth


def YahooSearch(keyword):
    urls = []
    resultUrls = []
    quantity = GetTheConfig('yahoo', 'QUANTITY')
    page = 1  # 'b' parameter: 1-based offset of the first result on a page
    user_agent = {'User-Agent': choice(user_agent_list)}
    while True:
        response = get('http://search.yahoo.co.jp/search?p=%s&b=%d' % (keyword, page),
                       headers=user_agent, timeout=5)
        html = response.text.encode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.find_all('div', {'class': 'hd'})
        for element in elements:
            for a in element.find_all('a', href=True):
                urls.append(a['href'])
        if page // 10 == int(quantity) // 10:  # stop once the configured quantity is covered
            break
        page = page + 10  # 10 results per page
    urls = DeduplicateValue(urls)
    for url in urls:
        resultUrls.append({"Yahoo": url})
    return resultUrls
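# DeduplicateValue is called throughout this section but defined elsewhere.
# A minimal sketch consistent with its call sites here (one list, plus an
# optional second list of values to exclude, as in SavingSearchData below);
# the project's real helper may differ.
def DeduplicateValue(values, excludedValues=None):
    uniqueValues = []
    for value in values:
        if value in uniqueValues:
            continue  # drop duplicates within the input itself
        if excludedValues is not None and value in excludedValues:
            continue  # drop values already stored (e.g. in the database)
        uniqueValues.append(value)
    return uniqueValues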
def FacebookSearch(keyword):
    urls = []
    resultUrls = []
    response = get("https://graph.facebook.com/search?access_token="
                   + access_token + "&q=" + keyword + "&type=page")
    data = response.text.encode('utf-8')
    jsonData = json.loads(data)
    page = 1
    user = jsonData["data"][0]["id"]  # take the first matched page's id
    graph = facebook.GraphAPI(access_token)
    profile = graph.get_object(user)
    posts = graph.get_connections(profile['id'], connection_name='posts')
    while True:
        for post in posts['data']:
            urls.append(GetTheUrl(post=post))  # collect the bare URL; wrapped below
        try:
            posts = get(posts['paging']['next']).json()  # follow pagination
        except Exception:  # no 'next' page, or the request failed
            posts = False
        finally:
            if not posts or page == int(GetTheConfig('facebook', 'QUANTITY')):
                break
        page = page + 1
    urls = DeduplicateValue(urls)
    for url in urls:
        resultUrls.append({"Facebook": url})
    return resultUrls
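# GetTheUrl is not defined in this section. A hypothetical sketch only:
# Graph API post ids commonly take the form "<page_id>_<post_id>", from
# which a permalink can be assembled. Both the id format and the permalink
# pattern are assumptions, not the project's confirmed implementation.
def GetTheUrl(post):
    pageId, postId = post['id'].split('_', 1)  # assumed "<page_id>_<post_id>" format
    return "https://www.facebook.com/%s/posts/%s" % (pageId, postId)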
def GoogleSearch(keyword):
    urls = []
    resultUrls = []
    prefixes = {
        'filetype:doc ',
        'filetype:pdf ',
        'filetype:ppt ',
        'filetype:hwp ',
        'filetype:txt '
    }  # TODO: move into a separate conf file
    urls = RequestSearch(urls, keyword)  # plain keyword search first
    for prefix in prefixes:
        urls = RequestSearch(urls, prefix + keyword)  # then one pass per filetype
    urls = DeduplicateValue(urls)
    for url in urls:
        resultUrls.append({"Google": url})
    return resultUrls
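# RequestSearch is not shown in this section. A rough, assumption-laden
# sketch of a scraping-based implementation: it assumes Google's classic
# "/url?q=<target>&..." wrapping of result links, which changes often, so
# treat this as illustrative rather than the project's actual code.
def RequestSearch(urls, keyword):
    response = get('https://www.google.com/search?q=%s' % keyword,
                   headers={'User-Agent': choice(user_agent_list)}, timeout=5)
    soup = BeautifulSoup(response.text, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('/url?q='):  # unwrap the redirect to the real target
            urls.append(href[len('/url?q='):].split('&')[0])
    return urls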
def BingSearch(keyword):
    urls = []
    resultUrls = []
    bing_web = PyBingWebSearch(GetTheConfig('bing', 'Key'), keyword, web_only=False)
    results = bing_web.search(limit=int(GetTheConfig('bing', 'QUANTITY')), format='json')
    for result in results:
        urls.append(result.url)
    urls = DeduplicateValue(urls)
    for url in urls:
        resultUrls.append({"Bing": url})
    return resultUrls
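# GetTheConfig is read by every function above but defined elsewhere. A
# minimal sketch, assuming a standard INI file parsed with ConfigParser;
# the file name 'setting.conf' is a placeholder. Note it returns strings,
# which is why call sites wrap numeric options in int().
import ConfigParser

def GetTheConfig(section, option):
    config = ConfigParser.ConfigParser()
    config.read('setting.conf')  # hypothetical config path
    return config.get(section, option)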
def BaiduSearch(keyword):
    urls = []
    count = -1
    page = int(GetTheConfig('baidu', 'QUANTITY')) + 1
    regexStoredBaiduUrl = re.compile(r'url":"(.*?)"}')
    while count < page:
        count = count + 1
        paging = 10 * count  # 'pn' parameter: result offset, 10 per page
        html = getHtml("http://www.baidu.com/s?wd=%s&pn=%s" % (keyword, str(paging)))
        if html == "Online Shield":  # getHtml's marker for Baidu's anti-bot page
            continue
        storedBaiduUrls = getElement(regexStoredBaiduUrl, html)  # redirect links embedded in the page
        realUrls = getRealUrl(storedBaiduUrls)  # resolve redirects to the target URLs
        realUrls = DeduplicateValue(realUrls)  # note: deduplicates per page only
        for url in realUrls:
            urls.append({"Baidu": url})
    return urls
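# getHtml, getElement and getRealUrl are helpers defined elsewhere. The
# least obvious is getRealUrl: Baidu wraps results in redirect links
# (www.baidu.com/link?url=...), so the target must be recovered by following
# the redirect. A hedged sketch of that step, not the project's actual code:
def getRealUrl(storedBaiduUrls):
    realUrls = []
    for storedBaiduUrl in storedBaiduUrls:
        try:
            response = get(storedBaiduUrl, timeout=5)  # requests follows redirects by default
            realUrls.append(response.url)  # final URL after the redirect chain
        except Exception:
            continue  # skip links that fail to resolve
    return realUrls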
def TwitterSearch(keyword):
    urls = []
    resultUrls = []
    hangul = re.compile(u'[ㄱ-ㅣ가-힣]+')  # matches Hangul jamo and syllables
    # token values for authenticating against the Twitter API
    ACCESS_TOKEN = GetTheConfig('twitter', 'ACCESS_TOKEN')
    ACCESS_SECRET = GetTheConfig('twitter', 'ACCESS_SECRET')
    CONSUMER_KEY = GetTheConfig('twitter', 'CONSUMER_KEY')
    CONSUMER_SECRET = GetTheConfig('twitter', 'CONSUMER_SECRET')
    oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
    twitter = Twitter(auth=oauth)
    params = {
        'result_type': 'recent',
        'count': int(GetTheConfig('twitter', 'QUANTITY'))
    }
    if hangul.match(keyword):  # restrict Korean keywords to Korean-language tweets
        params['lang'] = 'ko'
    params['q'] = keyword
    query = twitter.search.tweets(**params)
    for resultEntries in query["statuses"]:
        resultEntry = resultEntries["entities"]["urls"]
        for url in resultEntry:
            urls.append(url[u"expanded_url"])
    urls = DeduplicateValue(urls)
    for url in urls:
        resultUrls.append({"Twitter": url})
    return resultUrls
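# A tiny illustration of the Hangul branch above (uses the module-level re
# import): keywords containing Hangul are searched with lang='ko', anything
# else goes out without a language filter. Illustrative only.
def _demoHangulRouting():
    hangul = re.compile(u'[ㄱ-ㅣ가-힣]+')
    assert hangul.match(u'보안')          # Korean keyword -> params['lang'] = 'ko'
    assert not hangul.match(u'security')  # Latin keyword  -> no language filter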
def SavingSearchData(searchResults):
    Unspecified = 1
    workerIndex = 0
    worker = [u"이승용", u"이상훈", u"김성규", u"하동민"]
    databaseStoredUrls = []
    matchWorkerUrls = []
    if searchResults == []:  # nothing was found: widen the next search
        IncrementSearchingQuantity()
    # extract only the URLs from the search results
    sourceUrls, sourceImageUrls = ExtractSearchUrl(searchResults)
    # duplicate check against what is already stored
    StoredSearchUrls = executeNfetchall(
        GetTheConfig('query', 'SELECT_DATA_RESEARCH-RESEARCHURL'))
    for StoredSearchUrl in StoredSearchUrls:
        databaseStoredUrls.append(StoredSearchUrl['researchUrl'])
    # drop values that already exist in the database
    UniqueUrls = DeduplicateValue(sourceUrls, databaseStoredUrls)
    if UniqueUrls == []:  # nothing new after the database comparison: widen the next search
        IncrementSearchingQuantity()
    for url in UniqueUrls:
        imageUrl = None  # reset per URL so a previous thumbnail does not carry over
        platform = None
        for sourceImageUrl in sourceImageUrls:  # thumbnail image, if one was collected
            if url == sourceImageUrl.keys()[0]:
                imageUrl = sourceImageUrl[url]
                break
        for searchResult in searchResults:  # determine the source platform
            for urlDictionary in searchResult:
                if urlDictionary.keys()[0] == "Youtube":
                    if url == urlDictionary[urlDictionary.keys()[0]]["url"]:
                        platform = urlDictionary.keys()[0]
                        break
                else:
                    if url == urlDictionary.values()[0]:
                        platform = urlDictionary.keys()[0]
                        break
        siteUrl = ExtractDomain(url)  # extract the domain
        matchWorkerUrls.append({
            'worker': worker[workerIndex],  # assign workers round-robin
            'platform': platform,
            'siteUrl': siteUrl,
            'imageUrl': imageUrl,
            'url': url
        })
        workerIndex = workerIndex + 1
        if workerIndex == 4:
            workerIndex = 0
    # save to the database
    for matchWorkerUrl in matchWorkerUrls:
        print matchWorkerUrl
        executeNcommit(
            GetTheConfig('query', 'INSERT_DATA'),
            (matchWorkerUrl['worker'], '', matchWorkerUrl['siteUrl'],
             matchWorkerUrl['url'], matchWorkerUrl['platform'], Unspecified,
             matchWorkerUrl['imageUrl'], str(now()), str(now('limit'))))
    return True
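# A sketch of how the pieces compose into one run: gather every engine's
# results for a keyword, then persist them. The function name RunSearch and
# the call order are illustrative; this section does not show the project's
# real entry point.
def RunSearch(keyword):
    searchResults = [
        YahooSearch(keyword),
        FacebookSearch(keyword),
        GoogleSearch(keyword),
        BingSearch(keyword),
        BaiduSearch(keyword),
        TwitterSearch(keyword),
    ]
    return SavingSearchData(searchResults)  # expects a list of per-engine result lists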