Example #1
def main():
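    # Scrape article pages from www.olloo.mn and append each one to the
    # archive via Source/Writer.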
    conn = http.client.HTTPConnection("www.olloo.mn")
    ids = None
    root = None
    w = Writer()

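    # Close the writer cleanly if the process receives SIGTERM.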
    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (url, title) in get_urls():
            try:
                source = Source(url,
                                title=title,
                                scraper=ScraperOlloo,
                                conn=conn)
                #source.out_content = source.scraper.scraped()
                #print(source.out_content)
                source.makeRoot("./", ids=ids, root=root)
                source.add_to_archive()
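                # Keep the ID set and XML root produced by the first article
                # so every subsequent article is added to the same tree.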
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root
            except Exception as e:
                sys.stdout.write(str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
Example #2
def main():
    conn = http.client.HTTPConnection("altaicholmon.ru")
    ids = None
    root = None
    w = Writer()

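    # Collect (url, title, date) entries from every listing page into the
    # articles list before scraping.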
    for i in range(0, getLastPage() + 2):
        populateArticleList(i + 1)
    try:
        for (url, title, date) in articles:
            try:
                source = Source(url,
                                title=title,
                                date=date,
                                scraper=ScraperAltaicholmon,
                                conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang="alt")
                source.add_to_archive()
                if ids is None:
                    ids = source.ids
                if root is None:
                    root = source.root
            except Exception as e:
                print(url + " " + str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
Example #3
def tryOneArticle(url):
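    # Fetch a single article from the configured domain and add it to the
    # archive; useful for checking one URL in isolation.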
    global domain, siteLang
    root = None
    ids = None
    conn = http.client.HTTPConnection(domain)
    w = Writer()
    source = Source(url, title="", scraper=siteScraper, conn=conn)
    source.makeRoot("./", ids=ids, root=root, lang=siteLang)
    source.add_to_archive()
    w.close()
    conn.close()
Example #4
def main():
    global startyear, endyear, minweek, maxweek, domain, siteLang, siteScraper

    sys.stdout.write("\rGenerating urls...\n")
    sys.stdout.flush()
    allurls = get_allurls(startyear, endyear, minweek, maxweek)

    sys.stdout.write("\r%d articles total\n" % len(allurls))

    conn = http.client.HTTPConnection(domain)

    ids = None
    root = None
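    # "this" counts processed URLs for the "(current/total)" progress message.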
    this = 0
    w = Writer(5)

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (url, title) in allurls:
            #sys.stdout.write("\r"+url+" "+title+"\n")
            #sys.stdout.flush()
            this += 1
            try:
                source = Source(url,
                                title=title,
                                scraper=siteScraper,
                                conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang=siteLang)
                msg = "(%s/%s)" % (this, len(allurls))
                source.add_to_archive(msg=msg)
                if ids is None:  # if not ids:
                    ids = source.ids
                if root is None:  # if not root:
                    root = source.root

            except Exception as e:
                sys.stdout.write(" — %s \n" % e)
                sys.stdout.flush()
                raise
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
Example #5
def main(numScrape):
    conn = http.client.HTTPConnection("www.chuvash.org")
    mainPage = getPage(conn, '')
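    # Pull the numeric ID of the newest article out of the first headline
    # link on the front page (".../news/<id>.html").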
    latestArticleNum = int(
        mainPage.xpath("//h2[@class='hipar_head']")[0][0].attrib['href'].split(
            'news/')[1].replace('.html', ''))
    print('Scraping %s articles...' %
          ('all' if numScrape == -1 else numScrape))
    numScraped = 0
    attemptScrape = 0
    i = latestArticleNum
    ids = None
    root = None
    w = Writer()

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
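        # Walk article IDs downward from the newest one; numScrape == -1
        # means "scrape everything", otherwise stop after numScrape articles.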
        while i >= 1 and (numScraped < numScrape or numScrape == -1):
            try:
                url = "http://www.chuvash.org" + (urlTemplate % i)
                source = Source(url, scraper=ScraperChuvash, conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang="cv")
                source.add_to_archive()
                if ids is None:
                    ids = source.ids
                if root is None:
                    root = source.root
                attemptScrape += 1
                numScraped += 1
                if source.out_content is not None and len(source.out_content) == 0:
                    numScraped -= 1
            except Exception as e:
                print(url + " " + str(e))
            i -= 1
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    print("Attempted to scrape %s articles." % attemptScrape)
    print("%s articles scraped." % numScraped)
    w.close()
    conn.close()
def main(startDate, endDate):
    print("Getting URLs from %s to %s..." %
          (startDate,
           endDate))  #inclusive of startDate but does not include endDate
    conn = http.client.HTTPConnection("archive.news.mn")
    populateArticlesList(conn)
    print("%s URLs scraped from %s to %s" %
          (str(len(articles)), startDate, endDate))
    print("Scraping article content...")
    ids = None
    root = None
    scrapedNum = 0
    w = Writer(10)

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (title, url) in articles:
            # Skip video, id.news, english.news and photoalbum URLs: find()
            # returns -1 for each miss, so a sum of -4 means none matched.
            if (url.find("video.news") + url.find("id.news") +
                    url.find("english.news") + url.find("photoalbum")) == -4:
                try:
                    source = Source(url,
                                    title=title,
                                    scraper=ScraperNewsmn,
                                    conn=conn)
                    source.makeRoot("./", ids=ids, root=root, lang="khk")
                    source.add_to_archive()
                    if ids is None:
                        ids = source.ids
                    if root is None:
                        root = source.root
                    scrapedNum += 1
                except Exception as e:
                    print(url + " " + str(e))
        print("%s articles scraped" % scrapedNum)

    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    w.close()
    conn.close()
def main(startDate, endDate):
    print("Getting URLs from %s to %s and from %s pages..." %
          (startDate, endDate, numPages))
    conn = http.client.HTTPConnection("www.radioerkinli.com")
    populateArticlesList(conn)
    #printArticles(articles, 'test2.txt', False)
    print("%s URLs scraped from %s to %s and from %s pages * %s categories" %
          (str(len(articles)), startDate, endDate, numPages,
           str(len(urlStructures) - 1)))
    print("Scraping article content...")
    ids = None
    root = None
    scrapedNum = 0
    w = Writer()

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        for (title, url, date) in articles:
            try:
                source = Source(url,
                                title=title,
                                date=date,
                                scraper=ScraperAzadliq,
                                conn=conn)
                source.makeRoot("./", ids=ids, root=root, lang="ava")
                source.add_to_archive()
                if ids is None:
                    ids = source.ids
                if root is None:
                    root = source.root
                scrapedNum += 1
            except Exception as e:
                print(url + " " + str(e))
    except KeyboardInterrupt:
        print("\nReceived a keyboard interrupt. Closing the program.")
    print("%s articles scraped" % scrapedNum)
    w.close()
    conn.close()
def main():
	global startyear, endyear, minmonth, maxmonth

	sys.stdout.write("\rGenerating urls...\n")
	sys.stdout.flush()
	allurls = get_allurls(startyear, endyear, minmonth, maxmonth)

	sys.stdout.write("\r%d articles total\n" % len(allurls))

	conn = http.client.HTTPConnection("www.azattyk.org")

	ids = None
	root = None
	w = Writer()

	def term_handler(sigNum, frame):
		print("\nReceived a SIGTERM signal. Closing the program.")
		w.close()
		sys.exit(0)

	signal.signal(signal.SIGTERM, term_handler)
	
	try:
		for (url, title) in allurls:
			#sys.stdout.write("\r"+url+" "+title+"\n")
			#sys.stdout.flush()

			try:
				source = Source(url, title=title, scraper=ScraperAzattyk, conn=conn)
				source.makeRoot("./", ids=ids, root=root)
				source.add_to_archive()
				if ids is None:   # if not ids:
					ids = source.ids
				if root is None:  # if not root:
					root = source.root

			except Exception as e:
				sys.stdout.write(str(e))
	except KeyboardInterrupt:
		print("\nReceived a keyboard interrupt. Closing the program.")
	w.close()	
	conn.close()
def main(startDate, endDate):
	print("Getting URLs from %s to %s..." % (startDate, endDate)) #inclusive of both dates
	conn = http.client.HTTPConnection("www.azatutyun.am")
	populateArticlesList(conn)
	print("%s URLs scraped from %s to %s" % (str(len(articles)), startDate, endDate))
	print("Scraping article content...")
	ids = None
	root = None
	scrapedNum = 0
	w = Writer()

	def term_handler(sigNum, frame):
		print("\nReceived a SIGTERM signal. Closing the program.")
		w.close()
		sys.exit(0)

	signal.signal(signal.SIGTERM, term_handler)
	
	try:
		for (title, url, date) in articles:
			try:
				source = Source(url, title=title, date = date, scraper=ScraperAzatutyun, conn=conn)
				source.makeRoot("./", ids=ids, root=root, lang="hye")
				source.add_to_archive()
				if ids is None:
					ids = source.ids
				if root is None:
					root = source.root
				scrapedNum += 1
			except Exception as e:
				print(url + " " + str(e))			
		print("%s articles scraped" % scrapedNum)

	except KeyboardInterrupt:
		print("\nReceived a keyboard interrupt. Closing the program.")
	w.close()
	conn.close()
Example #10
if __name__ == "__main__":

    sys.stdout.write("Getting urls.")
    sys.stdout.flush()

    links = getPages()
    # for testing:
    #links=["http://kumukia.ru/adabiat/getpage.php?search=workpage&work=39&page=27", "http://kumukia.ru/adabiat/getpage.php?search=workpage&work=8&page=1"]
    #links=["http://kumukia.ru/adabiat/getpage.php?search=workpage&work=46&page=1", "http://kumukia.ru/adabiat/getpage.php?search=workpage&work=63&page=1"]
    #print("\n".join(links))

    conn = http.client.HTTPConnection(domainReal)
    ids = None
    root = None
    this = 0
    w = Writer()

    def term_handler(sigNum, frame):
        print("\nReceived a SIGTERM signal. Closing the program.")
        w.close()
        sys.exit(0)

    signal.signal(signal.SIGTERM, term_handler)

    try:
        #for (url, title) in allurls:
        for link in links:
            #sys.stdout.write("\r"+url+" "+title+"\n")
            #sys.stdout.flush()
            this += 1
            try: