def crawlNews(oid, processNo, pushedNo, startTime):
    """Producer: fetch raw article pages for one press (oid) and push them
    to the raw-news DB in chunks, resuming from the checkpoint in metadata."""
    while True:
        try:
            _, _, newsRawDB = connectDB(host)
            metadataCollection = newsRawDB['metadata']
            try:
                startNo = metadataCollection.find_one({"oid": oid})['last']
            except (TypeError, KeyError):
                startNo = 1  # no checkpoint yet: start from the first article
            tmpDB = []
            cnt = 0
            pushedNo.value += startNo - 1
            log('Process oid=%03d started at aid=%d' % (oid, startNo),
                startTime, processNo, pushedNo.value)
            for i in range(startNo, 999999999):
                status, newsResponseText, summary = getRaw(oid, i)
                if not status:
                    continue
                tmpDB.append({
                    'body': newsResponseText,
                    'summary': summary,
                    'aid': i
                })
                cnt += 1
                if cnt >= chunk:
                    if len(tmpDB) > 0:
                        newsRawDB[str(oid)].insert_many(tmpDB)
                        pushedNo.value += len(tmpDB)
                        log('Pushed %03d objects to DB at oid=%03d for aid=%d'
                            % (len(tmpDB), oid, i),
                            startTime, processNo, pushedNo.value)
                        tmpDB = []
                    cnt = 0
                    # Checkpoint progress so a restart resumes from aid=i.
                    try:
                        metadataCollection.delete_one({"oid": oid})
                        metadataCollection.insert_one({"oid": oid, "last": i})
                    except Exception:
                        pass
        except Exception:
            pass  # reconnect and retry from the last checkpoint
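# crawlNews() relies on three helpers defined elsewhere in the repo:
# connectDB(host), getRaw(oid, aid) and log(...). Below is a minimal sketch
# of plausible shapes for connectDB and getRaw only; the database names,
# the Naver URL scheme and the empty summary payload are assumptions,
# not the repo's actual code.
import requests
from pymongo import MongoClient


def connectDB(host):
    # Returns (newsDB, categoryDB, newsRawDB), matching how callers
    # unpack it throughout these scripts. DB names are assumed.
    client = MongoClient(host)
    return client['news'], client['category'], client['newsRaw']


def getRaw(oid, aid):
    # Fetch one article page; the (status, html, summary) shape mirrors
    # how crawlNews() unpacks the result.
    url = ('https://news.naver.com/main/read.nhn?oid=%03d&aid=%010d'
           % (oid, aid))  # assumed URL format
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return False, None, None
        return True, resp.text, {'summary': ''}  # summary endpoint omitted here
    except requests.RequestException:
        return False, None, None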
    return {
        'newsId': aid,
        'title': newsTitle,
        'body': newsText,
        'summary': summaryText,
        'category': category,
        'publishTime': publishTime,
        'editedTime': editedTime
    }


if __name__ == '__main__':
    multiprocessing.freeze_support()
    oid = int(input())
    newsDB, categoryDB, __ = connectDB(host)
    metadataCollection = newsDB['metadata']
    try:
        i = metadataCollection.find_one({"oid": oid})['last']
    except (TypeError, KeyError):
        i = 1  # no checkpoint yet: start from the first article
    while True:
        with multiprocessing.Pool(processes=processNo) as pool:
            # Fetch one batch of articles in parallel and drop failed
            # (None) results.
            newsList = list(
                filter(
                    partial(is_not, None),
                    tqdm(pool.imap_unordered(
                        getNews,
                        [(oid, x) for x in range(i, i + processNo * batch)]),
                        total=processNo * batch,
                        desc="Batch %d - %d" %
                        (i, i + processNo * batch))))  # operands assumed; the format string is truncated in the source
            # The rest of the batch loop (persisting newsList, advancing i)
            # is not shown in the source.
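# The dispatcher above needs a handful of module-level imports and tuning
# constants that the fragment does not show. The imports below are certain
# from usage (multiprocessing.Pool, partial, is_not, tqdm); the constant
# values are placeholders, not the repo's settings.
import multiprocessing
from functools import partial
from operator import is_not

from tqdm import tqdm

host = 'mongodb://localhost:27017'  # assumed connection string
processNo = 8    # pool size (assumed value)
batch = 100      # articles per worker per batch (assumed value)
chunk = 1000     # documents per insert_many in the crawler (assumed value)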
    news = []
    if not status:
        return []  # the caller treats the result as a list, so failures yield an empty one
    newsSoup = BeautifulSoup(html, 'html.parser')
    els = newsSoup.select('div.fcItem_top.clearfix')
    for i in els:
        try:
            body = i.select('a')[0].text
            # The score payload sits in an inline <script>; strip the
            # surrounding JS before decoding the object literal.
            factJson = demjson.decode(
                i.select('script')[0].text.strip()[14:-2].strip())
            score = np.mean(list(factJson['score'].values()))
            if score > 0:
                news.append({'body': body, 'score': score})
        except Exception:
            pass
    return news


newsDB, *_ = connectDB(host)
page = 1
tot = 0
while True:
    li = crawlNews(page)
    if len(li) > 0:
        newsDB['snu'].insert_many(li)
        tot += len(li)
        print('Pushed %d objects' % tot)
    page += 1
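# This fact-check scraper starts mid-function: `status` and `html` come
# from a fetch step the source omits, and demjson/numpy/BeautifulSoup are
# imported elsewhere in the file. A minimal sketch of the missing fetch,
# assuming the listing is paginated via a `page` query parameter (the URL
# and the helper name are hypothetical):
import requests


def fetchFactCheckPage(page):
    try:
        resp = requests.get('https://factcheck.snu.ac.kr/?page=%d' % page,
                            timeout=10)
        return resp.status_code == 200, resp.text
    except requests.RequestException:
        return False, None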
from bs4 import BeautifulSoup, Comment, NavigableString


def parseNews(oid, processNo, parsedNo, startTime):
    """Consumer: pull raw pages for one press (oid) in chunks, extract the
    article fields, and move them from the raw DB to the parsed DB."""
    while True:
        try:
            log('Process oid=%03d started.' % oid, 0, 0, 0)
            newsDB, categoryDB, newsRawDB = connectDB(host)
            while True:
                li = list(newsRawDB[str(oid)].find().limit(chunk))
                if len(li) == 0:
                    return
                log('Got %d Data from DB at oid=%03d' % (len(li), oid),
                    startTime, processNo, parsedNo.value)
                removeLi = []
                processedNews = []
                categoryDict = dict()
                for news in li:
                    try:
                        removeLi.append({'_id': news['_id']})
                        aid, body, summary = (news['aid'], news['body'],
                                              news['summary'])
                        summarySoup = BeautifulSoup(summary['summary'],
                                                    'html.parser')
                        summaryText = summarySoup.get_text()
                        newsText = ""
                        newsSoup = BeautifulSoup(body, 'html.parser')
                        bodyEl = newsSoup.find(id="articleBodyContents")
                        for i in bodyEl:
                            # Keep visible text, skip comments, expand <br>
                            # into newlines, and pull text out of embedded
                            # data-type="ore" blocks.
                            if type(i) is NavigableString:
                                newsText += i
                            elif type(i) is Comment:
                                pass
                            else:
                                if i.name == 'br':
                                    newsText += '\n'
                                if i.get('data-type') == 'ore':
                                    newsText += i.get_text()
                        newsText = newsText.replace('\n\n', '\n')
                        newsText = newsText.replace('\n', ' ')
                        newsText = newsText.replace('  ', ' ')
                        newsText = newsText.strip()
                        newsTitle = newsSoup.find(
                            id="articleTitle").get_text().strip()
                        category = []
                        for i in newsSoup.find_all(
                                "em", {"class": "guide_categorization_item"}):
                            category.append(sectionName[i.get_text()])
                            if sectionName[i.get_text()] not in categoryDict:
                                categoryDict[sectionName[i.get_text()]] = []
                            categoryDict[sectionName[i.get_text()]].append({
                                'oid': oid,
                                'aid': aid
                            })
                        timeEls = newsSoup.find_all("span", {"class": "t11"})
                        publishTime = strToDate(timeEls[0].get_text())
                        # A second t11 span appears only when the article
                        # was edited after publication.
                        if len(timeEls) == 2:
                            editedTime = strToDate(timeEls[1].get_text())
                        else:
                            editedTime = publishTime
                        processedNews.append({
                            'newsId': aid,
                            'title': newsTitle,
                            'body': newsText,
                            'summary': summaryText,
                            'category': category,
                            'publishTime': publishTime,
                            'editedTime': editedTime
                        })
                    except Exception:
                        pass
                for section, data in categoryDict.items():
                    categoryDB[section].insert_many(data)
                if len(processedNews) > 0:
                    newsDB[str(oid)].insert_many(processedNews)
                    parsedNo.value += len(processedNews)
                    log('Parsed %03d objects in DB at oid=%03d'
                        % (len(processedNews), oid),
                        startTime, processNo, parsedNo.value)
                for remove in removeLi:
                    newsRawDB[str(oid)].delete_one(remove)
                log('Dropped %03d objects in RAW at oid=%03d'
                    % (len(removeLi), oid),
                    startTime, processNo, parsedNo.value)
        except Exception:
            pass  # reconnect and retry the current chunk
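# parseNews() depends on strToDate() and the sectionName mapping, both
# defined elsewhere. Below is a sketch consistent with Naver's timestamp
# markup ("2019.01.02. 오후 3:45" style); the exact format string and the
# category names are assumptions.
from datetime import datetime, timedelta

sectionName = {  # Korean section label -> collection name (assumed set)
    '정치': 'politics', '경제': 'economy', '사회': 'society',
    '생활/문화': 'life', '세계': 'world', 'IT/과학': 'science',
}


def strToDate(s):
    # Normalise whitespace, pull out the AM (오전) / PM (오후) marker,
    # then parse the 12-hour timestamp.
    s = ' '.join(s.split())
    isPM = '오후' in s
    s = s.replace('오전 ', '').replace('오후 ', '')
    dt = datetime.strptime(s, '%Y.%m.%d. %I:%M')
    if isPM and dt.hour != 12:
        dt += timedelta(hours=12)
    elif not isPM and dt.hour == 12:
        dt -= timedelta(hours=12)  # 오전 12:xx is midnight
    return dt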