def main(): conn = http.client.HTTPConnection("www.olloo.mn") ids = None root = None w = Writer() def term_handler(sigNum, frame): print("\nReceived a SIGTERM signal. Closing the program.") w.close() sys.exit(0) signal.signal(signal.SIGTERM, term_handler) try: for (url, title) in get_urls(): try: source = Source(url, title=title, scraper=ScraperOlloo, conn=conn) #source.out_content = source.scraper.scraped() #print(source.out_content) source.makeRoot("./", ids=ids, root=root) source.add_to_archive() if ids is None: # ighf not ids: ids = source.ids if root is None: # if not root: root = source.root except Exception as e: sys.stdout.write(str(e)) except KeyboardInterrupt: print("\nReceived a keyboard interrupt. Closing the program.") w.close() conn.close()
def main(): conn = http.client.HTTPConnection("altaicholmon.ru") ids = None root = None w = Writer() for i in range(0, getLastPage() + 2): populateArticleList(i + 1) try: for (url, title, date) in articles: try: source = Source(url, title=title, date=date, scraper=ScraperAltaicholmon, conn=conn) source.makeRoot("./", ids=ids, root=root, lang="alt") source.add_to_archive() if ids is None: ids = source.ids if root is None: root = source.root except Exception as e: print(url + " " + str(e)) except KeyboardInterrupt: print("\nReceived a keyboard interrupt. Closing the program.") w.close() conn.close()
def tryOneArticle(url):
    global domain, siteLang
    root = None
    ids = None
    conn = http.client.HTTPConnection(domain)
    w = Writer()
    source = Source(url, title="", scraper=siteScraper, conn=conn)
    source.makeRoot("./", ids=ids, root=root, lang=siteLang)
    source.add_to_archive()
    w.close()
    conn.close()
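# Hypothetical usage sketch: spot-check a single article end to end before running
# the full crawl in main(). It assumes the script is invoked directly with the
# article URL as its first command-line argument; this argument handling is not
# part of the original script.
if __name__ == "__main__" and len(sys.argv) > 1:
    tryOneArticle(sys.argv[1])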
def main():
    global startyear, endyear, minweek, maxweek, domain, siteLang, siteScraper
    sys.stdout.write("\rGenerating urls...\n")
    sys.stdout.flush()
    allurls = get_allurls(startyear, endyear, minweek, maxweek)
    sys.stdout.write("\r%d articles total\n" % len(allurls))
    conn = http.client.HTTPConnection(domain)
    ids = None
    root = None
    this = 0
    w = Writer(5)

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)
    try:
        for (url, title) in allurls:
            #sys.stdout.write("\r"+url+" "+title+"\n")
            #sys.stdout.flush()
            this += 1
            try:
                source = Source(url, title=title, scraper=siteScraper, conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang=siteLang)
                msg = "(%s/%s)" % (this, len(allurls))
                source.add_to_archive(msg=msg)
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root
            except Exception as e:
                sys.stdout.write(" — %s \n" % e)
                sys.stdout.flush()
                raise
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
def main(numScrape): conn = http.client.HTTPConnection("www.chuvash.org") mainPage = getPage(conn, '') latestArticleNum = int( mainPage.xpath("//h2[@class='hipar_head']")[0][0].attrib['href'].split( 'news/')[1].replace('.html', '')) print('Scraping %s articles...' % ('all' if numScrape is -1 else numScrape)) numScraped = 0 attemptScrape = 0 i = latestArticleNum ids = None root = None w = Writer() def term_handler(sigNum, frame): print("\nReceived a SIGTERM signal. Closing the program.") w.close() sys.exit(0) signal.signal(signal.SIGTERM, term_handler) try: while i >= 1 and (numScraped < numScrape or numScrape is -1): try: url = "http://www.chuvash.org" + (urlTemplate % i) source = Source(url, scraper=ScraperChuvash, conn=conn) source.makeRoot("./", ids=ids, root=root, lang="cv") source.add_to_archive() if ids is None: ids = source.ids if root is None: root = source.root attemptScrape += 1 numScraped += 1 if source.out_content is not None and len( source.out_content) is 0: numScraped -= 1 except Exception as e: print(url + " " + str(e)) i -= 1 except KeyboardInterrupt: print("\nReceived a keyboard interrupt. Closing the program.") print("Attempted to scrape %s articles." % attemptScrape) print("%s articles scraped." % numScraped) w.close() conn.close()
def main(startDate, endDate): print("Getting URLs from %s to %s..." % (startDate, endDate)) #inclusive of startDate but does not include endDate conn = http.client.HTTPConnection("archive.news.mn") populateArticlesList(conn) print("%s URLs scraped from %s to %s" % (str(len(articles)), startDate, endDate)) print("Scraping article content...") ids = None root = None scrapedNum = 0 w = Writer(10) def term_handler(sigNum, frame): print("\nReceived a SIGTERM signal. Closing the program.") w.close() sys.exit(0) signal.signal(signal.SIGTERM, term_handler) try: for (title, url) in articles: if url.find("video.news") + url.find("id.news") + url.find( "english.news") + url.find("photoalbum") is -4: try: source = Source(url, title=title, scraper=ScraperNewsmn, conn=conn) source.makeRoot("./", ids=ids, root=root, lang="khk") source.add_to_archive() if ids is None: ids = source.ids if root is None: root = source.root scrapedNum += 1 except Exception as e: print(url + " " + str(e)) print("%s articles scraped" % scrapedNum) except KeyboardInterrupt: print("\nReceived a keyboard interrupt. Closing the program.") w.close() conn.close()
def main(startDate, endDate): print("Getting URLs from %s to %s and from %s pages..." % (startDate, endDate, numPages)) conn = http.client.HTTPConnection("www.radioerkinli.com") populateArticlesList(conn) #printArticles(articles, 'test2.txt', False) print("%s URLs scraped from %s to %s and from %s pages * %s categories" % (str(len(articles)), startDate, endDate, numPages, str(len(urlStructures) - 1))) print("Scraping article content...") ids = None root = None scrapedNum = 0 w = Writer() def term_handler(sigNum, frame): print("\nReceived a SIGTERM signal. Closing the program.") w.close() sys.exit(0) signal.signal(signal.SIGTERM, term_handler) try: for (title, url, date) in articles: try: source = Source(url, title=title, date=date, scraper=ScraperAzadliq, conn=conn) source.makeRoot("./", ids=ids, root=root, lang="ava") source.add_to_archive() if ids is None: ids = source.ids if root is None: root = source.root scrapedNum += 1 except Exception as e: print(url + " " + str(e)) except KeyboardInterrupt: print("\nReceived a keyboard interrupt. Closing the program.") print("%s articles scraped" % scrapedNum) w.close() conn.close()
def main():
    global startyear, endyear, minmonth, maxmonth
    sys.stdout.write("\rGenerating urls...\n")
    sys.stdout.flush()
    allurls = get_allurls(startyear, endyear, minmonth, maxmonth)
    sys.stdout.write("\r%d articles total\n" % len(allurls))
    conn = http.client.HTTPConnection("www.azattyk.org")
    ids = None
    root = None
    w = Writer()

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)
    try:
        for (url, title) in allurls:
            #sys.stdout.write("\r"+url+" "+title+"\n")
            #sys.stdout.flush()
            try:
                source = Source(url, title=title, scraper=ScraperAzattyk, conn=conn)
                source.makeRoot("./", ids=ids, root=root)
                source.add_to_archive()
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root
            except Exception as e:
                sys.stdout.write(str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
def main(startDate, endDate): print("Getting URLs from %s to %s..." % (startDate, endDate)) #inclusive of both dates conn = http.client.HTTPConnection("www.azatutyun.am") populateArticlesList(conn) print("%s URLs scraped from %s to %s" % (str(len(articles)), startDate, endDate)) print("Scraping article content...") ids = None root = None scrapedNum = 0 w = Writer() def term_handler(sigNum, frame): print("\nReceived a SIGTERM signal. Closing the program.") w.close() sys.exit(0) signal.signal(signal.SIGTERM, term_handler) try: for (title, url, date) in articles: try: source = Source(url, title=title, date = date, scraper=ScraperAzatutyun, conn=conn) source.makeRoot("./", ids=ids, root=root, lang="hye") source.add_to_archive() if ids is None: ids = source.ids if root is None: root = source.root scrapedNum += 1 except Exception as e: print(url + " " + str(e)) print("%s articles scraped" % scrapedNum) except KeyboardInterrupt: print("\nReceived a keyboard interrupt. Closing the program.") w.close() conn.close()
if __name__ == "__main__": sys.stdout.write("Getting urls.") sys.stdout.flush() links = getPages() # for testing: #links=["http://kumukia.ru/adabiat/getpage.php?search=workpage&work=39&page=27", "http://kumukia.ru/adabiat/getpage.php?search=workpage&work=8&page=1"] #links=["http://kumukia.ru/adabiat/getpage.php?search=workpage&work=46&page=1", "http://kumukia.ru/adabiat/getpage.php?search=workpage&work=63&page=1"] #print("\n".join(links)) conn = http.client.HTTPConnection(domainReal) ids = None root = None this = 0 w = Writer() def term_handler(sigNum, frame): print("\nReceived a SIGTERM signal. Closing the program.") w.close() sys.exit(0) signal.signal(signal.SIGTERM, term_handler) try: #for (url, title) in allurls: for link in links: #sys.stdout.write("\r"+url+" "+title+"\n") #sys.stdout.flush() this += 1 try: