Example #1
def main():
	torStatus = checkCronJob.checkCronStatus(os.getcwd()+"/checkTorStartStatus.txt")
	print torStatus
	if (torStatus):
		start()
	else:
		print "Tor is already active"
	TorFirefox.startDisplay(display)
	browser = TorFirefox.getFirefoxBrowser()

	TorFirefox.closeBrowser(browser)
	TorFirefox.closeDisplay(display)
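
Every example on this page gates itself on checkCronJob.checkCronStatus, whose source is not shown here. The following is only a minimal sketch, assuming the flag file holds the date string the examples write back via Cur_date.strfTime() after a successful run (0 = already done today, 1 = still pending):

## Sketch only -- checkCronJob is not shown on this page; the flag-file
## format (a single "%Y-%m-%d" date written after a successful run) is an
## assumption inferred from the calls above.
import os
import datetime

def checkCronStatus(flag_file_path):
    # 0 = flag file already holds today's date (job done), 1 = still pending
    if not os.path.isfile(flag_file_path):
        return 1
    with open(flag_file_path) as inFile:
        last_run = inFile.read().strip()
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    return 0 if last_run == today else 1
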
Example #2
def main():
    now = datetime.datetime.now()
    print "* Time of RUN : ", now

    if (datetime.datetime.today().hour < 12):
        print "This script is scheduled to run only after 1200hrs; "
        print "change it in the code if you want otherwise"
        print "------------------------SLEEP TIME-------------------------------"
        return

    ## Starting Tor; comment this part out if you start Tor manually or when debugging
    torStatus = checkCronJob.checkCronStatus(os.getcwd() +
                                             "/checkTorStartStatus.txt")
    if (torStatus):
        startTor.start()
    else:
        print "Tor is already active"

    # status = checkCronJob.checkCronStatus("/home/gugli/Documents/script_py/Dainik_Jagron/checkTorWatchdog.txt")
    # start_tor_watchdog(status)
    hack_paper()

    hindustan.Hindustan()
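
The Cur_date helpers supply every date string used in the URLs and flag files below. Their source is not on this page; the formats in this sketch are inferred from the commented-out sample URLs ("26012018", "25", "2018-05-04") and are assumptions, including the lack of zero-padding in getPrevDayDate:

## Sketch only -- formats inferred from the commented-out sample URLs below.
import datetime

def getCurDate():
    # e.g. "26012018" for 26 Jan 2018 (used in the Jagran image path)
    return datetime.datetime.now().strftime("%d%m%Y")

def getPrevDayDate():
    # day-of-month of yesterday, e.g. 25 when today is the 26th
    return (datetime.datetime.now() - datetime.timedelta(days=1)).day

def strfTime():
    # e.g. "2018-05-04" (used in the Hindustan page path and the flag files)
    return datetime.datetime.now().strftime("%Y-%m-%d")
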
Example #3
def hack_paper():

	status = checkCronJob.checkCronStatus()
	print status
	if(status == 0):
		print "JOB's already done"
		return


	todaysDate = Cur_date.getCurDate()

	pdf_docs = []
	pages = get_pages.getPages()+1

	# pages = 2

	dir_path = os.path.dirname(os.path.realpath(__file__))

	for pageno in xrange(1,pages):
		for city in ['smt','mdb','bgh']:
			url = "http://epaper.jagran.com/epaperimages/"+todaysDate+"/muzaffarpur/"+str(Cur_date.getPrevDayDate())+city+"-pg"+ str(pageno) +"-0.pdf"
			# url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf"

			print url

			## send the file path to the downloader;
			## it also contains the file name of the downloaded file
			file_path = dir_path + "/" + str(pageno)+".pdf"
			print "Downloading...page no = ", pageno

			download.download_file(url,file_path)

			flag = pdf_merger.check_valid_pdf(file_path)
			if(flag == 0):
				pdf_docs.append(file_path)
				break  # stop at the first city that yields a valid pdf for this page
			else:
				os.remove(file_path)
				print "PAGE NO",pageno,"with city =", city, "DONT EXIST"
		# pdf_docs.append(file_path)

	final_file_path = dir_path + "/" + todaysDate+".pdf"
	pdf_merger.FileMerger(pdf_docs, final_file_path)


	subject = "epaper dated "+ todaysDate

	# file_path = dir_path + "/" + final_file_name

	### for qpython -- files are downloaded in this directory
	# cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
	# file_path = cd_dir_path + "/" + final_file_name

	try:
		print "SENDING EMAIL..............."
		send_email.send_mail(configg.fromaddr,configg.password,configg.toaddr,subject,todaysDate+".pdf",final_file_path)

		pdf_docs.append(final_file_path)
		Delete_Files.del_files(pdf_docs)

		## update the cron flag file once the job is done for the day

		with open('/home/gugli/Documents/script_py/Dainik_Jagron/checkCronStatus.txt','w') as outFile:
			outFile.write( Cur_date.strfTime())

		
	except Exception as e:
		Delete_Files.del_files(pdf_docs)
		print "COULDNOT SEND MAIL...."
		print e
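
download.download_file is also not shown here. A minimal sketch that matches how the callers treat its return value (0 for a valid pdf, 1 otherwise), assuming a plain urllib2 fetch plus a '%PDF' magic-byte check:

## Sketch only -- behaviour inferred from the call sites above.
import urllib2

def download_file(url, file_path):
    # returns 0 when a valid pdf was saved, 1 otherwise
    try:
        data = urllib2.urlopen(url, timeout=60).read()
    except Exception as e:
        print e
        return 1
    if not data.startswith('%PDF'):
        return 1  # the server sent an error page, not a pdf
    with open(file_path, 'wb') as outFile:
        outFile.write(data)
    return 0
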
Example #4
def Hindustan():
    status = checkCronJob.checkCronStatus(os.getcwd() +
                                          "/checkCronStatusH.txt")
    # print status

    if (status == 0):
        print "[[[[[[==Hindustans' JOB's already done==]]]]]]"
        return
    print "==================HINDUSTAN====================="
    ### used ctrl+u to view the page source (javascript) of the webpage
    #===================================================================
    ## the browser is only used for pdfcompressor; the epaper pages themselves are fetched with urllib2
    TorFirefox.startDisplay(display)
    browser = TorFirefox.getFirefoxBrowser()
    url = 'http://pdfcompressor.com/'
    browser.get(url)
    #===================================================================
    dir_path = os.path.dirname(os.path.realpath(__file__))
    todaysDate = Cur_date.strfTime()

    totalPages = get_pages.getPagesHindustan()
    print type(totalPages), totalPages

    # totalPages = 20
    for pageno in xrange(1, totalPages + 1):

        # for pageno in xrange(1,5):

        url = "http://epaper.livehindustan.com/epaper/Bihar/Sitamarhi/" + todaysDate + "/108/Page-" + str(
            pageno) + ".html"
        # url = "http://epaper.livehindustan.com/epaper/Bihar/Sitamarhi/2018-05-04/108/Page-"+ str(pageno)+".html"
        print url
        html_page = urllib2.urlopen(url)

        soup = BeautifulSoup(html_page, "html.parser")
        try:
            src = soup.find('img',
                            {'id': 'imgpage_' + str(pageno)})['src'].replace(
                                'll.jpg', '.pdf').replace('s.png',
                                                          '.pdf').replace(
                                                              '.jpg', '.pdf')

        except Exception as e:
            print e
            print "+++++++++++++++++++IMG tag with id imgpage not found +++++++++++++++++++++"
            continue

        # print src
        ### the site removed the relative-path prefix below, so the code was changed accordingly
        # pdf_url = src.replace("../../../../..","http://epaper.livehindustan.com")
        pdf_url = "http://epaper.livehindustan.com" + src
        print pdf_url
        # this file is uploaded to the compression website
        file_path_del = dir_path + "/h" + str(pageno) + ".pdf"
        #this variable stores the file name of the compressed file yet to be downloaded
        file_path_merge = dir_path + "/h" + str(pageno) + "-min" + ".pdf"

        ## downloads only if the file is not present
        if (not os.path.isfile(file_path_merge)):
            print "Downloading...page no = ", pageno
            ## returns 0 when a valid pdf was downloaded; otherwise the loop continues
            flag = download.download_file2(pdf_url, file_path_del, browser)
            # time.sleep(2)
        else:
            print "FILE is already present"

        ## check valid pdf
        if (os.path.isfile(file_path_merge) or flag == 0):
            pdf_docs_del.append(file_path_del)
            pdf_docs_merge.append(file_path_merge)
        else:
            # os.remove(file_path)
            print "-------------PAGE NO ", pageno, "DONT EXIST ----------------------------"
            continue
        if (pageno == 20):
            print "    ==== reached page no 20 ====="
            flag1 = PdfCompressor.downloadCompPDF(browser)
            if (flag1):  # flag1 = 0 when there is no file to be downloaded
                print "  ===Unzipping first 20 files===="
                PdfCompressor.unzipFile(os.getcwd() + "/pdfcompressor.zip",
                                        os.getcwd())
                os.remove(os.getcwd() + "/pdfcompressor.zip")
                PdfCompressor.reset_all(browser)

    ##Downloading all compressed file in zip format

    flag1 = PdfCompressor.downloadCompPDF(browser)
    ## and unzipping it in the current folder
    # print os.getcwd()
    if (flag1):  # flag1 = 0 when there is no file to be downloaded
        PdfCompressor.unzipFile(os.getcwd() + "/pdfcompressor.zip",
                                os.getcwd())
        os.remove(os.getcwd() + "/pdfcompressor.zip")

    ## Merging all the individual files
    final_file_path = dir_path + "/h" + todaysDate + ".pdf"
    # print pdf_docs_merge
    # print final_file_path
    pdf_merger.FileMerger(pdf_docs_merge, final_file_path)

    # close the browser and display once the downloads are done
    TorFirefox.closeBrowser(browser)
    TorFirefox.closeDisplay(display)
    ##==========================================================================================
    ##we will now check if the file size is under limit or not ( < 25 MB for attachment in gmail)
    checkSizeFlag = checkFileSize.check(final_file_path)
    k = 1

    while checkSizeFlag:  # True while the file size exceeds 25 MB
        os.remove(
            final_file_path
        )  # must remove first: pdf_merger only merges if the target doesn't exist

        pdf_merger.FileMerger(pdf_docs_merge[:-k], final_file_path)
        checkSizeFlag = checkFileSize.check(final_file_path)
        print "++++++++++ Removed last %s" % (k), 'file +++++++++++++'
        k = k + 1

    pdf_docs_merge.append(final_file_path)
    ##==========================================================================================

    ##==========================================================================================
    ## now we are ready to send email
    ##Hindi text
    akhbaar = u'\u0905' + u'\u0916' + u'\u093c' + u'\u092c' + u'\u093e' + u'\u0930'
    dinakit = u'\u0926' + u'\u093f' + u'\u0928' + u'\u093e' + u'\u0902' + u'\u0915' + u'\u093f' + u'\u0924'
    # print akhbaar +' ' +dinakit

    subject = "Hindustan " + akhbaar + ' ' + dinakit + ' ' + todaysDate

    try:
        ## guard against more than one copy of the script running at once,
        ## which can happen when the network is slow or script 1 runs long
        print "Checking if mail is already sent ..... "
        status = checkCronJob.checkCronStatus(os.getcwd() +
                                              "/checkCronStatusH.txt")
        print status
        if (status == 0):
            print "Mail has been sent already..."
            delete_files(pdf_docs_del, pdf_docs_merge)
            return
        print "SENDING EMAIL..............."
        send_email.send_mail(configg.fromaddr, configg.password,
                             configg.toaddr, subject, todaysDate + ".pdf",
                             final_file_path)
        delete_files(pdf_docs_del, pdf_docs_merge)
        # Delete_Files.del_files(pdf_docs_merge)
        # Delete_Files.del_files(pdf_docs_del)
        # print "FILES DELETED"
        ##updating cron Flag file when the job is done for the day

        with open(os.getcwd() + '/checkCronStatusH.txt', 'w') as outFile:
            outFile.write(Cur_date.strfTime())

    except Exception as e:
        # Delete_Files.del_files(pdf_docs)
        print "COULDNOT SEND MAIL...."
        print e
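
Two small housekeeping helpers used above, checkFileSize.check and the file-deletion routine, are likewise not shown; these one-liners are assumptions based on how the examples call them (the 25 MB figure comes from the Gmail attachment-limit comment):

## Sketch only -- both helpers are inferred from their call sites.
import os

def check(file_path, limit_mb=25):
    # checkFileSize.check: True while the merged pdf exceeds the Gmail limit
    return os.path.getsize(file_path) > limit_mb * 1024 * 1024

def del_files(file_paths):
    # Delete_Files.del_files: remove every intermediate file, ignoring misses
    for path in file_paths:
        if os.path.isfile(path):
            os.remove(path)
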
Example #5
def hack_paper():

    status = checkCronJob.checkCronStatus(os.getcwd() + "/checkCronStatus.txt")
    # print status

    if (status == 0):
        # print "Dainiks' JOB's already done"
        print "[[[[[[==Dainiks' JOB's already done==]]]]]]"
        return

    todaysDate = Cur_date.getCurDate()

    pdf_docs = []
    ## stores the file paths of the files to be merged and later deleted
    pdf_docs_merge = []

    pages = get_pages.getPages()

    # pages = 2

    dir_path = os.path.dirname(os.path.realpath(__file__))
    #===================================================================
    TorFirefox.startDisplay(display)
    browser = TorFirefox.getFirefoxBrowser()
    url = 'http://pdfcompressor.com/'
    browser.get(url)
    #===================================================================

    for pageno in xrange(1, pages + 1):  # xrange excludes the end value

        for city in ['smt', 'mdb', 'bgh']:
            url = "http://epaper.jagran.com/epaperimages/" + todaysDate + "/muzaffarpur/" + str(
                Cur_date.getPrevDayDate()) + city + "-pg" + str(
                    pageno) + "-0.pdf"
            # url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf"

            print url

            ## send the file path to the downloader;
            ## it also contains the file name of the downloaded file
            file_path = dir_path + "/" + str(pageno) + ".pdf"
            #this variable stores the file name of the compressed file yet to be downloaded
            file_path_merge = dir_path + "/" + str(pageno) + "-min" + ".pdf"

            if (not os.path.isfile(file_path_merge)):
                print "Downloading...page no = ", pageno
                ## returns 0 when a valid pdf was downloaded; otherwise the loop continues
                flag = download.download_file2(url, file_path, browser)
                time.sleep(2)
            else:
                print "FILE is already present"

            # flag = pdf_merger.check_valid_pdf(file_path)
            ## if the file is present move on, else check the flag; note flag is only assigned when the first condition is False
            if (os.path.isfile(file_path_merge) or flag == 0):
                pdf_docs.append(file_path)
                pdf_docs_merge.append(file_path_merge)
                break  # stop at the first city that yields a valid pdf for this page
            else:
                # os.remove(file_path)
                print "PAGE NO", pageno, "with city =", city, "DONT EXIST"
                continue
        ### in case the number of pages exceeds 20 (pdfcompressor.com takes 20 files at most),
        ### download and unzip the uploaded files, then reset the site.
        if (pageno == 20):
            print "    ==== reached page no 20 ====="
            flag1 = PdfCompressor.downloadCompPDF(browser)
            if (flag1):  # flag1 = 0 when there is no file to be downloaded
                print "  ===Unzipping first 20 files===="

                PdfCompressor.unzipFile(os.getcwd() + "/pdfcompressor.zip",
                                        os.getcwd())
                os.remove(os.getcwd() + "/pdfcompressor.zip")
                ##Resetting the browser to upload other files
                PdfCompressor.reset_all(browser)

    flag1 = PdfCompressor.downloadCompPDF(browser)
    if (flag1):  # flag1 = 0 when there is no file to be downloaded
        PdfCompressor.unzipFile(os.getcwd() + "/pdfcompressor.zip",
                                os.getcwd())
        os.remove(os.getcwd() + "/pdfcompressor.zip")

    # print pdf_docs
    final_file_path = dir_path + "/" + todaysDate + ".pdf"
    pdf_merger.FileMerger(pdf_docs_merge, final_file_path)

    TorFirefox.closeBrowser(browser)
    TorFirefox.closeDisplay(display)

    # final_file_path_new = dir_path + "/" + todaysDate+"-min.pdf"  ## '-min' is appended when the compressed file is downloaded
    ## if compression doesn't reach the required size, the last resort is to drop some files
    checkSizeFlag = checkFileSize.check(final_file_path)
    k = 1

    while checkSizeFlag:
        os.remove(
            final_file_path
        )  # must remove first: pdf_merger only merges if the target doesn't exist

        pdf_merger.FileMerger(pdf_docs_merge[:-k], final_file_path)  # drop pages from the same compressed list that was merged above
        checkSizeFlag = checkFileSize.check(final_file_path)
        print "++++++++++ Removed last %s" % (k), 'file +++++++++++++'
        k = k + 1
    pdf_docs.append(final_file_path)
    ##=======================================================================================================

    ##Hindi text
    akhbaar = u'\u0905' + u'\u0916' + u'\u093c' + u'\u092c' + u'\u093e' + u'\u0930'
    dinakit = u'\u0926' + u'\u093f' + u'\u0928' + u'\u093e' + u'\u0902' + u'\u0915' + u'\u093f' + u'\u0924'
    # print akhbaar +' ' +dinakit

    subject = "Dainik Jagron " + akhbaar + ' ' + dinakit + ' ' + todaysDate

    # file_path = dir_path + "/" + final_file_name

    ###for qpython -- files download in this directory
    # cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    # file_path = cd_dir_path + "/" + final_file_name

    try:
        ## guard against more than one copy of the script running at once,
        ## which can happen when the network is slow or script 1 runs long
        print "Checking if mail is already sent ..... "
        status = checkCronJob.checkCronStatus(os.getcwd() +
                                              "/checkCronStatus.txt")
        print status
        if (status == 0):
            print "Mail has been sent already..."
            delete_files(pdf_docs, pdf_docs_merge)
            return
        print "SENDING EMAIL..............."
        send_email.send_mail(configg.fromaddr, configg.password,
                             configg.toaddr, subject, todaysDate + ".pdf",
                             final_file_path)

        delete_files(pdf_docs, pdf_docs_merge)
        # Delete_Files.del_files(pdf_docs)
        # Delete_Files.del_files(pdf_docs_merge)

        ##updating cron Flag file when the job is done for the day
        with open(os.getcwd() + '/checkCronStatus.txt', 'w') as outFile:
            outFile.write(Cur_date.strfTime())

    except Exception as e:
        # Delete_Files.del_files(pdf_docs)
        print "COULDNOT SEND MAIL...."
        print e
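
pdf_merger.check_valid_pdf and pdf_merger.FileMerger could be built on PyPDF2; that library choice is an assumption, but the sketch reproduces the "merge only when the target file is absent" behaviour the comments above depend on:

## Sketch only -- PyPDF2 is assumed; merge-only-if-absent mirrors the comments above.
import os
from PyPDF2 import PdfFileMerger, PdfFileReader

def check_valid_pdf(file_path):
    # 0 = readable pdf, 1 = corrupt download (e.g. an HTML error page)
    try:
        PdfFileReader(open(file_path, 'rb'))
        return 0
    except Exception:
        return 1

def FileMerger(pdf_paths, final_file_path):
    if os.path.isfile(final_file_path):
        return  # callers delete the target first when they want a re-merge
    merger = PdfFileMerger()
    for path in pdf_paths:
        merger.append(PdfFileReader(open(path, 'rb')))
    merger.write(final_file_path)
    merger.close()
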
Example #6
def hack_paper():

    status = checkCronJob.checkCronStatus()
    # print status

    if (status == 0):
        print "JOB's already done"
        return

    todaysDate = Cur_date.getCurDate()

    pdf_docs = []
    pages = get_pages.getPages() + 1

    # pages = 2

    dir_path = os.path.dirname(os.path.realpath(__file__))

    for pageno in xrange(1, pages):

        for city in ['smt', 'mdb', 'bgh']:
            url = "http://epaper.jagran.com/epaperimages/" + todaysDate + "/muzaffarpur/" + str(
                Cur_date.getPrevDayDate()) + city + "-pg" + str(
                    pageno) + "-0.pdf"
            # url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf"

            print url

            ## send the file path to the downloader;
            ## it also contains the file name of the downloaded file
            file_path = dir_path + "/" + str(pageno) + ".pdf"
            print "Downloading...page no = ", pageno

            ## returns 0 when a valid pdf was downloaded; otherwise the loop continues
            flag = download.download_file(url, file_path)

            # flag = pdf_merger.check_valid_pdf(file_path)

            if (flag == 0):
                pdf_docs.append(file_path)
                break  # stop at the first city that yields a valid pdf for this page
            else:
                # os.remove(file_path)
                print "PAGE NO", pageno, "with city =", city, "DONT EXIST"
                continue

    # print pdf_docs
    final_file_path = dir_path + "/" + todaysDate + ".pdf"
    pdf_merger.FileMerger(pdf_docs, final_file_path)

    ## if compression doesn't reach the required size, the last resort is to drop some files
    checkSizeFlag = checkFileSize.check(final_file_path)
    k = 1

    while checkSizeFlag:
        os.remove(
            final_file_path
        )  # must remove first: pdf_merger only merges if the target doesn't exist

        pdf_merger.FileMerger(pdf_docs[:-k], final_file_path)
        checkSizeFlag = checkFileSize.check(final_file_path)
        print "++++++++++ Removed last %s" % (k), 'file +++++++++++++'
        k = k + 1
    ##=======================================================================================================

    ##Hindi text
    akhbaar = u'\u0905' + u'\u0916' + u'\u093c' + u'\u092c' + u'\u093e' + u'\u0930'
    dinakit = u'\u0926' + u'\u093f' + u'\u0928' + u'\u093e' + u'\u0902' + u'\u0915' + u'\u093f' + u'\u0924'
    # print akhbaar +' ' +dinakit

    subject = akhbaar + ' ' + dinakit + ' ' + todaysDate

    # file_path = dir_path + "/" + final_file_name

    ###for qpython -- files download in this directory
    # cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    # file_path = cd_dir_path + "/" + final_file_name

    try:
        ## guard against more than one copy of the script running at once,
        ## which can happen when the network is slow or script 1 runs long
        print "Checking if mail is already sent ..... "
        status = checkCronJob.checkCronStatus()
        print status
        if (status == 0):
            print "Mail has been sent already..."
            return
        print "SENDING EMAIL..............."
        send_email.send_mail(configg.fromaddr, configg.password,
                             configg.toaddr, subject, todaysDate + ".pdf",
                             final_file_path)

        pdf_docs.append(final_file_path)
        Delete_Files.del_files(pdf_docs)

        ##updating cron Flag file when the job is done for the day

        with open(
                '/home/gugli/Documents/script_py/Dainik_Jagron/checkCronStatus.txt',
                'w') as outFile:
            outFile.write(Cur_date.strfTime())

    except Exception as e:
        Delete_Files.del_files(pdf_docs)
        print "COULDNOT SEND MAIL...."
        print e
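
Finally, send_email.send_mail is not shown either. A sketch using the standard smtplib/email modules; the Gmail SMTP endpoint is an assumption suggested by the 25 MB attachment-limit comments:

## Sketch only -- Gmail SMTP is an assumption; credentials come from configg.
import smtplib
from email.header import Header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def send_mail(fromaddr, password, toaddr, subject, filename, file_path):
    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['To'] = toaddr
    msg['Subject'] = Header(subject, 'utf-8')  # subject may contain Hindi text
    msg.attach(MIMEText('Attached: ' + filename))
    with open(file_path, 'rb') as inFile:
        part = MIMEApplication(inFile.read(), _subtype='pdf')
    part.add_header('Content-Disposition', 'attachment', filename=filename)
    msg.attach(part)
    server = smtplib.SMTP('smtp.gmail.com', 587)
    server.starttls()
    server.login(fromaddr, password)
    server.sendmail(fromaddr, toaddr, msg.as_string())
    server.quit()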