import csv
import os
import re
import ssl
import time
from datetime import datetime

import feedparser
import requests
from bs4 import BeautifulSoup

# clean_html, getKeywords, and getKeywordsForMulti are project-local helpers
# assumed to be defined elsewhere in this repo.


def feed_parsing(file, multi):
    # Work around SSLCertVerificationError when fetching feeds over HTTPS.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context

    link_list = ['https://www.computerworld.com/index.rss',
                 'https://www.recode.net/rss/index.xml']
    general_url = re.compile(r'(https?://)([a-z0-9\w]+\.*)+[a-z0-9]{2,4}')

    for link in link_list:
        d = feedparser.parse(link)
        for entry in d.entries:
            title = entry['title']
            content = clean_html(entry['description']) \
                .replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
                .replace('\n', ' ').strip().replace('"', '').replace("'", '')
            entry_link = entry['link']  # renamed so it no longer shadows the feed URL
            # Second dotted label of the matched host, e.g. 'computerworld' or 'recode'.
            source = general_url.search(entry_link).group().split('.')[1]
            if multi:
                keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source, False)
            else:
                keyword_list = getKeywords(title.lower(), content.lower(), source, False)
            _keyword = ",".join(keyword_list) if keyword_list else ""
            # published_parsed may be missing; fall back to the current time.
            created_at = time.strftime("%Y-%m-%d",
                                       entry.get("published_parsed") or time.gmtime())
            row = [title, content[:200] + "...", entry_link, 0, source,
                   _keyword, "", created_at, 0]
            file.writerow(row)
            updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                           "a", encoding='utf-8', newline='')
            csv.writer(updater).writerow(row)
            updater.close()
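
# A minimal sketch of how feed_parsing is presumably driven, given that
# `file` is used as a csv.writer above. The data/data.csv path and the
# run_feed_parsing name are assumptions, not existing project code.
def run_feed_parsing(multi=False):
    base = os.path.dirname(os.path.realpath(__file__))
    with open(base + "/data/data.csv", "a", encoding='utf-8', newline='') as f:
        feed_parsing(csv.writer(f), multi)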

# Woowa Bros tech blog scraper (woowabros.github.io).
def getData(file, link, multi):
    r = requests.get(link)
    content_soup = BeautifulSoup(r.text, "lxml")
    title = content_soup.find("h1", {"class": "post-title"}).text \
        .replace(u'\xa0', ' ').replace('\t', ' ').strip() \
        .replace('"', '').replace("'", '')
    created_at = content_soup.find("time")['datetime'].split("T")[0]

    content = ""
    image = ""  # initialised up front so it is defined even if no post-content div matches
    section_content = content_soup.find_all("div", {"class": "post-content"})
    for s in section_content:
        img = s.find("img")
        # Skip gravatar avatars; prefix site-relative paths with the blog host.
        if img is not None and "gravatar" not in img['src']:
            if img['src'].startswith("/"):
                image = "http://woowabros.github.io" + img['src']
            print(image)
        for c in s.find_all("p"):
            content += c.text

    content = content.replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
        .replace('\n', ' ').strip().replace('"', '').replace("'", '')
    source = "woowabros"
    if multi:
        keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source)
    else:
        keyword_list = getKeywords(title.lower(), content.lower(), source)
    _keyword = ",".join(keyword_list) if keyword_list else ""

    row = [title, content[:200] + "...", link, 0, source, _keyword, image, created_at, 0]
    file.writerow(row)
    updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                   "a", encoding='utf-8', newline='')
    csv.writer(updater).writerow(row)
    updater.close()

# Dropbox tech blog scraper.
def getData(file, link, multi):
    r = requests.get(link)
    content_soup = BeautifulSoup(r.text, "lxml")
    title = content_soup.select_one('h1.entry-title').text \
        .replace(u'\xa0', ' ').replace('\t', ' ').strip() \
        .replace('"', '').replace("'", '')

    # Renamed from `s` so it no longer shadows the section loop variable below.
    date_text = content_soup.select_one('span.post-date').text
    try:
        created_at = datetime.strptime(date_text, ' %B %d, %Y').strftime('%Y-%m-%d')
    except ValueError:
        # Relative dates: subtract the single-digit day offset at date_text[1]
        # from today's day of month, zero-padding the result.
        day = int(datetime.today().day) - int(date_text[1])
        created_at = datetime.today().strftime("%Y-%m-") + str(day).zfill(2)
    print(created_at)

    content = ""
    section_content = content_soup.find_all("div", {"class": "entry-content"})
    for s in section_content:
        for c in s.find_all("p"):
            content += c.text

    content = content.replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
        .replace('\n', ' ').strip().replace('"', '').replace("'", '')
    source = "dropbox"
    if multi:
        keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source, False)
    else:
        keyword_list = getKeywords(title.lower(), content.lower(), source, False)
    _keyword = ",".join(keyword_list) if keyword_list else ""

    row = [title, content[:200] + "...", link, 0, source, _keyword, "", created_at, 0]
    file.writerow(row)
    updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                   "a", encoding='utf-8', newline='')
    csv.writer(updater).writerow(row)
    updater.close()

# A List Apart scraper; returns False for pre-2013 posts so the caller can stop.
def getData(file, link, multi):
    r = requests.get(link)
    content_soup = BeautifulSoup(r.text, "lxml")
    title = content_soup.select_one('h1.entry-title').text \
        .replace(u'\xa0', ' ').replace('\t', ' ').strip() \
        .replace('"', '').replace("'", '')
    created_at = content_soup.find("time")['datetime'].split("T")[0]
    print(created_at)
    if int(created_at.split('-')[0]) < 2013:
        print(created_at.split('-')[0])
        return False

    content = ""
    image = ""  # defined up front so it survives even if no main-content div matches
    section_content = content_soup.find_all("div", {"class": "main-content"})
    for s in section_content:
        img = s.select_one('figure > img')
        if img is not None:
            if img['src'].startswith("/"):
                image = "https://alistapart.com" + img['src']
            else:
                image = img['src']
            print(image)
        for c in s.find_all("p"):
            content += c.text

    content = content.replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
        .replace('\n', ' ').strip().replace('"', '').replace("'", '')
    source = "alistapart"
    if multi:
        keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source, False)
    else:
        keyword_list = getKeywords(title.lower(), content.lower(), source, False)
    _keyword = ",".join(keyword_list) if keyword_list else ""

    row = [title, content[:200] + "...", link, 0, source, _keyword, image, created_at, 0]
    file.writerow(row)
    updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                   "a", encoding='utf-8', newline='')
    csv.writer(updater).writerow(row)
    updater.close()
    return True
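
# The False return above reads as a stop signal for whoever walks the archive.
# A sketch of that pattern; crawl_alistapart and article_links (newest first)
# are assumptions, and this assumes the scraper lives in its own module so
# getData resolves to the A List Apart version.
def crawl_alistapart(writer, article_links, multi=False):
    for article_link in article_links:
        if not getData(writer, article_link, multi):
            break  # hit a pre-2013 post, so stop paging further back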

# LINE Engineering blog scraper.
def getData(file, link, multi):
    r = requests.get(link)
    content_soup = BeautifulSoup(r.text, "lxml")
    title = content_soup.find("h1", {"class": "entry-title"}).text \
        .replace(u'\xa0', ' ').replace('\t', ' ').strip() \
        .replace('"', '').replace("'", '')
    # Byline date: swap '.' separators for '-' and drop the leading 3 characters.
    created_at = content_soup.select_one('span.byline').text.replace(".", "-")[3:]
    print(created_at)

    content = ""
    image = ""  # defined up front so it survives even if no entry-content div matches
    section_content = content_soup.find_all("div", {"class": "entry-content single-page"})
    for s in section_content:
        img = s.find("img")
        if img is not None:
            if img['src'].startswith("/"):
                image = "https://engineering.linecorp.com/ko/blog" + img['src']
            else:
                image = img['src']
            print(image)
        for c in s.find_all("p"):
            content += c.text

    content = content.replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
        .replace('\n', ' ').strip().replace('"', '').replace("'", '')
    source = "line"
    if multi:
        keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source)
    else:
        keyword_list = getKeywords(title.lower(), content.lower(), source)
    _keyword = ",".join(keyword_list) if keyword_list else ""

    row = [title, content[:200] + "...", link, 0, source, _keyword, image, created_at, 0]
    file.writerow(row)
    updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                   "a", encoding='utf-8', newline='')
    csv.writer(updater).writerow(row)
    updater.close()

# OpenAI blog scraper.
def getData(file, link, multi):
    r = requests.get(link)
    content_soup = BeautifulSoup(r.text, "lxml")
    try:
        title = content_soup.find("h1", {"class": "PostHeader-title"}).text \
            .replace(u'\xa0', ' ').replace('\t', ' ').replace('\n', ' ').strip() \
            .replace('"', '').replace("'", '')
    except AttributeError:  # find() returned None: not an article page, skip it
        print("pass")
        return
    created_at = content_soup.find(
        "meta", property="article:published_time")['content'].split("T")[0]
    print(created_at)

    content = ""
    image = ""  # defined up front so it survives even if no post-content section matches
    section_content = content_soup.find_all("section", {"class": "post-content"})
    for s in section_content:
        img = s.find("img")
        if img is not None:
            if img['src'].startswith("/"):
                image = "https://blog.openai.com" + img['src']
            else:
                image = img['src']
            print(image)
        for c in s.find_all("p"):
            content += c.text

    content = content.replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
        .replace('\n', ' ').strip().replace('"', '').replace("'", '')
    source = "openai"
    if multi:
        keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source, False)
    else:
        keyword_list = getKeywords(title.lower(), content.lower(), source, False)
    _keyword = ",".join(keyword_list) if keyword_list else ""

    row = [title, content[:200] + "...", link, 0, source, _keyword, image, created_at, 0]
    file.writerow(row)
    updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                   "a", encoding='utf-8', newline='')
    csv.writer(updater).writerow(row)
    updater.close()

# Kakao tech blog scraper.
def getData(file, link, multi):
    r = requests.get(link)
    content_soup = BeautifulSoup(r.text, "lxml")
    title = content_soup.select_one('div#cover > div > h1').text \
        .replace(u'\xa0', ' ').replace('\t', ' ').strip() \
        .replace('"', '').replace("'", '')
    created_at = content_soup.select_one('p#post-date').text[0:10]

    content = ""
    image = ""  # defined up front so it survives even if no post-content div matches
    section_content = content_soup.find_all("div", {"id": "post-content"})
    for s in section_content:
        img = s.find("img")
        if img is not None:
            if img['src'].startswith("/"):
                image = "http://tech.kakao.com" + img['src']
            else:
                image = img['src']
            print(image)
        for c in s.find_all("p"):
            content += c.text

    content = content.replace(u'\xa0', ' ').replace('\t', ' ').replace('<br>', ' ') \
        .replace('\n', ' ').strip().replace('"', '').replace("'", '')
    source = "kakao"
    if multi:
        keyword_list = getKeywordsForMulti(title.lower(), content.lower(), source)
    else:
        keyword_list = getKeywords(title.lower(), content.lower(), source)
    _keyword = ",".join(keyword_list) if keyword_list else ""

    row = [title, content[:200] + "...", link, 0, source, _keyword, image, created_at, 0]
    file.writerow(row)
    updater = open(os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv",
                   "a", encoding='utf-8', newline='')
    csv.writer(updater).writerow(row)
    updater.close()
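
# Every scraper above repeats the same text cleanup and the same double write
# (main CSV plus data/update.csv). A sketch of helpers that could absorb that
# duplication; clean_text and write_row are suggested names, not existing
# project functions.
def clean_text(text):
    # Collapse the whitespace replacements repeated in every scraper above,
    # then strip the quote characters the original code removes.
    for old in (u'\xa0', '\t', '<br>', '\n'):
        text = text.replace(old, ' ')
    return text.strip().replace('"', '').replace("'", '')


def write_row(file, row):
    # Write to the main CSV writer and append the same row to data/update.csv.
    file.writerow(row)
    path = os.path.dirname(os.path.realpath(__file__)) + "/data/update.csv"
    with open(path, "a", encoding='utf-8', newline='') as updater:
        csv.writer(updater).writerow(row)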