Example #1
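This snippet, like every example below, relies on the standard-library modules http.client, signal and sys, and on project-local helpers (Writer, Source, and the per-site Scraper classes) whose imports are not shown in the listings. A minimal import header might look like the following; the project-local paths are placeholders, not the real module layout:

import http.client
import signal
import sys

# Project-local helpers. These import paths are hypothetical placeholders;
# the actual module layout is not shown in the examples.
# from writer import Writer
# from source import Source
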
def main():
    conn = http.client.HTTPConnection("www.bolod.mn")
    ids = None
    root = None
    w = Writer(5)

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (url, title) in get_urls():
            try:
                source = Source(url,
                                title=title,
                                scraper=ScraperBolod,
                                conn=conn)
                #source.out_content = source.scraper.scraped()
                #print(source.out_content)
                source.makeRoot("./", ids=ids, root=root)
                source.add_to_archive()
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root
            except Exception as e:
                sys.stdout.write(str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
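
The Writer cleanup on SIGTERM follows the same shape in nearly every example. Below is a minimal, self-contained sketch of that pattern, assuming a stand-in Writer class (the real Writer belongs to the scraper project and is not shown here):

import signal
import sys
import time

class StubWriter:
    # Stand-in for the project's Writer; only close() matters for this sketch.
    def close(self):
        print("Writer closed.")

w = StubWriter()

def term_handler(sigNum, frame):
    # Same shape as the handlers above: close the writer, then exit cleanly.
    print("\nReceived a SIGTERM signal. Closing the program.")
    w.close()
    sys.exit(0)

signal.signal(signal.SIGTERM, term_handler)

# Sending SIGTERM to this process (e.g. kill <pid>) now runs term_handler
# instead of terminating the interpreter abruptly.
while True:
    time.sleep(1)
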
Example #2
def main():
    conn = http.client.HTTPConnection("khakaschiry.ru")
    ids = None
    root = None
    w = Writer()

    for i in range(0, getLastPage()):
        populateArticleList(i + 1)

    try:
        for (url, title, date) in articles:
            try:
                source = Source(url,
                                title=title,
                                date=date,
                                scraper=ScraperKhakaschiry,
                                conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang="kjh")
                source.add_to_archive()
                if ids is None:
                    ids = source.ids
                if root is None:
                    root = source.root
            except Exception as e:
                print(url + " " + str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
Example #3
def main():
	global startyear, endyear, minmonth, maxmonth

	sys.stdout.write("\rGenerating urls...\n")
	sys.stdout.flush()
	allurls = get_allurls(startyear, endyear, minmonth, maxmonth)

	sys.stdout.write("\r%d articles total\n" % len(allurls))

	conn = http.client.HTTPConnection("www.azattyk.org")

	ids = None
	root = None
	for (url, title) in allurls:
		#sys.stdout.write("\r"+url+" "+title+"\n")
		#sys.stdout.flush()

		try:
			source = Source(url, title=title, scraper=ScraperAzattyk, conn=conn)
			source.makeRoot("./", ids=ids, root=root)
			source.add_to_archive()
			if ids is None:   # if not ids:
				ids = source.ids
			if root is None:  # if not root:
				root = source.root

		except Exception as e:
			sys.stdout.write(str(e))
	
	conn.close()
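
get_allurls itself is not shown in these listings. Purely to illustrate the month-range iteration it implies, a hypothetical generator over (year, month) pairs could look like the following; the path format is invented for illustration and is not the site's real archive layout:

def iter_months(startyear, endyear, minmonth, maxmonth):
    # Yield every (year, month) pair in the requested range, inclusive.
    for year in range(startyear, endyear + 1):
        for month in range(minmonth, maxmonth + 1):
            yield year, month

# Hypothetical archive-page paths built from the month range; the pattern
# below is a placeholder, not www.azattyk.org's real structure.
paths = ["/archive/%04d/%02d" % (year, month)
         for (year, month) in iter_months(2015, 2016, 1, 12)]
print("%d archive pages to fetch" % len(paths))
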
Example #4
def tryOneArticle(url):
    global domain, siteLang
    root = None
    ids = None
    conn = http.client.HTTPConnection(domain)
    w = Writer()
    source = Source(url, title="", scraper=siteScraper, conn=conn)
    source.makeRoot("./", ids=ids, root=root, lang=siteLang)
    source.add_to_archive()
    w.close()
    conn.close()
Example #5
def main():
    global startyear, endyear, minweek, maxweek, domain, siteLang, siteScraper

    sys.stdout.write("\rGenerating urls...\n")
    sys.stdout.flush()
    allurls = get_allurls(startyear, endyear, minweek, maxweek)

    sys.stdout.write("\r%d articles total\n" % len(allurls))

    conn = http.client.HTTPConnection(domain)

    ids = None
    root = None
    this = 0
    w = Writer(5)

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (url, title) in allurls:
            #sys.stdout.write("\r"+url+" "+title+"\n")
            #sys.stdout.flush()
            this += 1
            try:
                source = Source(url,
                                title=title,
                                scraper=siteScraper,
                                conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang=siteLang)
                msg = "(%s/%s)" % (this, len(allurls))
                source.add_to_archive(msg=msg)
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root

            except Exception as e:
                sys.stdout.write(" — %s \n" % e)
                sys.stdout.flush()
                raise
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
Example #6
def main(numScrape):
    conn = http.client.HTTPConnection("www.chuvash.org")
    mainPage = getPage(conn, '')
    # The first front-page headline links to the newest article; pull its
    # numeric ID out of the .../news/<id>.html href.
    latestArticleNum = int(
        mainPage.xpath("//h2[@class='hipar_head']")[0][0].attrib['href'].split(
            'news/')[1].replace('.html', ''))
    print('Scraping %s articles...' %
          ('all' if numScrape == -1 else numScrape))
    numScraped = 0
    attemptScrape = 0
    i = latestArticleNum
    ids = None
    root = None
    w = Writer()

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        while i >= 1 and (numScraped < numScrape or numScrape == -1):
            try:
                url = "http://www.chuvash.org" + (urlTemplate % i)
                source = Source(url, scraper=ScraperChuvash, conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang="cv")
                source.add_to_archive()
                if ids is None:
                    ids = source.ids
                if root is None:
                    root = source.root
                attemptScrape += 1
                numScraped += 1
                if source.out_content is not None and len(
                        source.out_content) == 0:
                    numScraped -= 1
            except Exception as e:
                print(url + " " + str(e))
            i -= 1
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    print("Attempted to scrape %s articles." % attemptScrape)
    print("%s articles scraped." % numScraped)
    w.close()
    conn.close()
Example #7
def main(startDate, endDate):
    print("Getting URLs from %s to %s..." %
          (startDate,
           endDate))  #inclusive of startDate but does not include endDate
    conn = http.client.HTTPConnection("archive.news.mn")
    populateArticlesList(conn)
    print("%s URLs scraped from %s to %s" %
          (str(len(articles)), startDate, endDate))
    print("Scraping article content...")
    ids = None
    root = None
    scrapedNum = 0
    w = Writer(10)

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (title, url) in articles:
            if url.find("video.news") + url.find("id.news") + url.find(
                    "english.news") + url.find("photoalbum") is -4:
                try:
                    source = Source(url,
                                    title=title,
                                    scraper=ScraperNewsmn,
                                    conn=conn)
                    source.makeRoot("./", ids=ids, root=root, lang="khk")
                    source.add_to_archive()
                    if ids is None:
                        ids = source.ids
                    if root is None:
                        root = source.root
                    scrapedNum += 1
                except Exception as e:
                    print(url + " " + str(e))
        print("%s articles scraped" % scrapedNum)

    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
Example #8
def main(startDate, endDate):
    print("Getting URLs from %s to %s and from %s pages..." %
          (startDate, endDate, numPages))
    conn = http.client.HTTPConnection("www.radioerkinli.com")
    populateArticlesList(conn)
    #printArticles(articles, 'test2.txt', False)
    print("%s URLs scraped from %s to %s and from %s pages * %s categories" %
          (str(len(articles)), startDate, endDate, numPages,
           str(len(urlStructures) - 1)))
    print("Scraping article content...")
    ids = None
    root = None
    scrapedNum = 0
    w = Writer()

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (title, url, date) in articles:
            try:
                source = Source(url,
                                title=title,
                                date=date,
                                scraper=ScraperAzadliq,
                                conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang="ava")
                source.add_to_archive()
                if ids is None:
                    ids = source.ids
                if root is None:
                    root = source.root
                scrapedNum += 1
            except Exception as e:
                print(url + " " + str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    print("%s articles scraped" % scrapedNum)
    w.close()
    conn.close()
Example #9
def main():
	global startyear, endyear, minmonth, maxmonth

	sys.stdout.write("\rGenerating urls...\n")
	sys.stdout.flush()
	allurls = get_allurls(startyear, endyear, minmonth, maxmonth)

	sys.stdout.write("\r%d articles total\n" % len(allurls))

	conn = http.client.HTTPConnection("www.azattyk.org")

	ids = None
	root = None
	w = Writer()

	def term_handler(sigNum, frame):
		print("\nReceived a SIGTERM signal. Closing the program.")
		w.close()
		sys.exit(0)

	signal.signal(signal.SIGTERM, term_handler)
	
	try:
		for (url, title) in allurls:
			#sys.stdout.write("\r"+url+" "+title+"\n")
			#sys.stdout.flush()

			try:
				source = Source(url, title=title, scraper=ScraperAzattyk, conn=conn)
				source.makeRoot("./", ids=ids, root=root)
				source.add_to_archive()
				if ids is None:   # if not ids:
					ids = source.ids
				if root is None:  # if not root:
					root = source.root

			except Exception as e:
				sys.stdout.write(str(e))
	except KeyboardInterrupt:
		print("\nReceived a keyboard interrupt. Closing the program.")
	w.close()	
	conn.close()
Example #10
def main(startDate, endDate):
	print("Getting URLs from %s to %s..." % (startDate, endDate)) #inclusive of both dates
	conn = http.client.HTTPConnection("www.azatutyun.am")
	populateArticlesList(conn)
	print("%s URLs scraped from %s to %s" % (str(len(articles)), startDate, endDate))
	print("Scraping article content...")
	ids = None
	root = None
	scrapedNum = 0
	w = Writer()

	def term_handler(sigNum, frame):
		print("\nReceived a SIGTERM signal. Closing the program.")
		w.close()
		sys.exit(0)

	signal.signal(signal.SIGTERM, term_handler)
	
	try:
		for (title, url, date) in articles:
			try:
				source = Source(url, title=title, date = date, scraper=ScraperAzatutyun, conn=conn)
				source.makeRoot("./", ids=ids, root=root, lang="hye")
				source.add_to_archive()
				if ids is None:
					ids = source.ids
				if root is None:
					root = source.root
				scrapedNum += 1
			except Exception as e:
				print(url + " " + str(e))			
		print("%s articles scraped" % scrapedNum)

	except KeyboardInterrupt:
		print("\nReceived a keyboard interrupt. Closing the program.")
	w.close()
	conn.close()
Example #11
    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        #for (url, title) in allurls:
        for link in links:
            #sys.stdout.write("\r"+url+" "+title+"\n")
            #sys.stdout.flush()
            this += 1
            try:
                #linkies = link.split('.')
                #url = linkies[0]+"."+urllib.parse.quote(linkies[1], encoding="utf8")
                #print(url)
                url = link
                source = Source(url, scraper=siteScraper, conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang=siteLang)
                msg = "(%s/%s)" % (this, len(links))
                source.add_to_archive(msg=msg)
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root
            except Exception as e:
                sys.stdout.write(" — %s \n" % e)
                sys.stdout.flush()
                raise
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()