def store_zipped_file(filename, dirname):
	"""
	Zips and uploads the file filename into the FTP subdirectory dirname,
	then cds back out to the parent directory.

	Uses the module-level ``ftp`` connection. On any failure the error is
	reported through phox_utilities.do_RuntimeError (which raises
	RuntimeError to be caught by the calling program).
	"""
	global ftp
	filezip = filename + '.zip'
	try:
		# Argument-list form (no shell) so unusual characters in the file
		# name cannot be interpreted by the shell.
		subprocess.call(["zip", filezip, filename])
		ftp.cwd(dirname)               # change into subdirectory
		fbin = open(filezip, 'rb')     # binary mode for STOR; was text mode and never closed
		try:
			ftp.storbinary("STOR " + filezip, fbin)
		finally:
			fbin.close()
		ftp.cwd('..')               # back out
	except Exception:
		phox_utilities.do_RuntimeError('Store of',  filename, '.zip unsuccessful')
def main(thisday):
    scraperfilename = phox_utilities.Scraper_Stem + thisday + ".txt"
    print "Mongo: Scraper file name:", scraperfilename

    recordfilename = phox_utilities.Recordfile_Stem + thisday + ".txt"
    print "Mongo: Record file name:", recordfilename

    newsourcefile = newsourcestem + thisday + ".txt"
    print "Mongo: New Sources file name:", newsourcefile

    try:
        fin = open(scraperfilename, "r")
    except IOError:
        phox_utilities.do_RuntimeError("Could not find the scraper file for", thisday)

    finlist = fin.readlines()
    fout = open(recordfilename, "w")
    newout = open(newsourcefile, "w")
    sourcecount = {}

    storyno = 1
    csno = 1

    for line in range(0, len(finlist)):
        if "http" in finlist[line]:
            field = finlist[line].split("\t")
            thisURL = field[2][:-1]
            thisURL = thisURL[:MAX_URLLENGTH]  # temporary to accommodate TABARI input limits

            thisstory = get_story(finlist[line + 1])
            thisdate = get_date(field)
            thissource = get_source(field)

            if thissource == "999":
                print >> newout, thisURL  # Adds sources not included in sources dictionary to 'newsource_results_20..' file output

            write_record(thissource, sourcecount, thisdate, thisURL, thisstory, fout)

    fin.close()
    fout.close()
    print "Finished"
def get_zipped_file(filename, dirname):
	"""
	Downloads the file filename+'.zip' from the FTP subdirectory dirname,
	reads it into tempfile.zip, cds back out to the parent directory,
	unzips it, and removes the temporary archive.

	Uses the module-level ``ftp`` connection. On failure the error is
	reported through phox_utilities.do_RuntimeError (which raises
	RuntimeError to be caught by the calling program).
	"""
	global ftp
	fbin = open('tempfile.zip', 'wb')
	try:
		ftp.cwd(dirname)               # change into subdirectory
		ftp.retrbinary("RETR " + filename + '.zip', fbin.write)
		ftp.cwd('..')               # back out
		phox_utilities.logger.info('Successfully retrieved ' + filename + '.zip\n')
	except Exception:
		fbin.close()  # was previously leaked on the error-return path
		phox_utilities.do_RuntimeError('Retrieval of',  filename, '.zip unsuccessful')
		return

	fbin.close()
	try:
		# Argument-list form (no shell) so the fixed file name cannot be
		# mangled by shell interpretation. -o: overwrite without prompting.
		subprocess.call(["unzip", "-o", "tempfile.zip"])
		subprocess.call(["rm", "tempfile.zip"])  # clean up
	except Exception:
		phox_utilities.do_RuntimeError('Downloaded file',  filename, 'could not be decompressed')


if __name__ == "__main__":
    # Stand-alone test mode: any extra argument beyond the date triggers
    # local logger and config initialization (the pipeline normally does
    # this for us).
    if len(sys.argv) > 2:
        phox_utilities.init_logger("test_pipeline.log")
        logger = phox_utilities.logger  # local copy for the pipeline
        phox_utilities.parse_config("test_config.ini")  # set the phox_utilities globals

    if len(sys.argv) <= 1:
        phox_utilities.do_RuntimeError("No date suffix in Mongo.formatter.py")
    else:
        thisday = sys.argv[1]

    main(thisday)
def main(datestr):
	"""
	Deduplicate the full event file for datestr into the daily event file.

	Reads <Fullfile_Stem><datestr>.txt (tab-delimited; field[0] is the
	date), writes <Eventfile_Stem><datestr>.txt via writeevents(), and
	records per-source duplicate information via writedups(). Communicates
	with those routines through the module globals fout, evtdict, DUPCOUNT,
	evtdup and curday. Raises RuntimeError (via
	phox_utilities.do_RuntimeError) if the input file is missing.
	"""
	global fout, evtdict, DUPCOUNT, evtdup, curday
	try:
		fin = open(phox_utilities.Fullfile_Stem + datestr + '.txt', 'r')
	except IOError:
		phox_utilities.do_RuntimeError('Could not find the full event file for', datestr)

	eventfilename = phox_utilities.Eventfile_Stem + datestr + '.txt'
	fout = open(eventfilename, 'w')
	print 'Writing', eventfilename

	curday = '000000'  # sentinel: no day seen yet
	dayno = 1
	line = fin.readline()
	while len(line) > 0:  # loop through the file
		field = line[:-1].split('\t')
#		print '--',field
		if field[0] != curday:
			# Day changed: flush the previous day's events and duplicate
			# records, then reset the per-day dictionaries.
			writeevents()
			if curday != '000000':
				writedups(curday)
			curday = field[0]
			evtdict = {}
			evtdup = {}

		# string to check against for duplicates
		evt = field[1] + field[2] + field[3]
		src = field[5][0:3]  # 3-char source prefix used to group duplicates
		field[6] = field[6][:16]  # debug -- readability
		field.append(1)  # initial duplicate count; DUPCOUNT presumably indexes this appended element -- confirm
#		print evt
		if evt in evtdict:	# duplicate
#			print '++',field
			evtdict[evt][DUPCOUNT] += 1
	#		print evt, evtdict[evt][5], evtdict[evt][6]
			gotsrc = False
			# Look for an existing entry for this source prefix and extend it
			for ka in range(len(evtdup[evt])):
				if evtdup[evt][ka][0] == src:
					evtdup[evt][ka][1] += 1
					evtdup[evt][ka].append(field[5])
					evtdup[evt][ka].append(field[6])
					gotsrc = True
					break
			if not gotsrc:	# new source
				evtdup[evt].append([src, 1, field[5], field[6]])

		else:
			# First sighting of this event: keep the full record
			evtdict[evt] = field
			evtdup[evt] = []

		dayno += 1
	#	if dayno > 128: sys.exit()   # debug

		line = fin.readline()

	fin.close()

	writeevents()  # write final day (which is only day in the pipeline call)
	fout.close()

	writedups(datestr)

	print "Finished"
def main(datestr):
	"""
	Upload the daily event and duplicate files for datestr to the FTP
	server, then refresh the cumulative monthly and yearly files.

	When something goes amiss, the various routines raise and pass through
	a RuntimeError(explanation) rather than trying to recover, since this
	probably means something is either wrong with the ftp connection or the
	file structure got corrupted. This error is logged but needs to be
	caught in the calling program.
	"""
	global ftp

	# log into the server
	try:
		ftp = FTP(phox_utilities.Server_List[0])     # connect to host, default port
		ftp.login(phox_utilities.Server_List[1], phox_utilities.Server_List[2])
		ftp.cwd(phox_utilities.Server_List[3])               # change into PHOX directory
		print 'Logged into:', phox_utilities.Server_List[0], '/', phox_utilities.Server_List[1]
	except:
		phox_utilities.do_RuntimeError('Login to', phox_utilities.Server_List[0], 'unsuccessful')

	# upload the daily event and duplicate index files
	try:
		eventfilename = phox_utilities.Eventfile_Stem + datestr + '.txt'
		store_zipped_file(eventfilename,'Daily')
	except:
		phox_utilities.do_RuntimeError('Transfer of', eventfilename,'unsuccessful')

	try:
		dupfilename = phox_utilities.Dupfile_Stem + datestr + '.txt'
		store_zipped_file(dupfilename,'Daily/Duplicates')
		ftp.cwd('..')               # back out one more level
	except:
		phox_utilities.do_RuntimeError('Transfer of', dupfilename,'unsuccessful')


	# update the monthly and yearly files
	# NOTE(review): datestr is presumably YYMMDD — [:2] year, [2:4] month — confirm with caller
	monthfilename = phox_utilities.Outputfile_Stem + datestr[:2] + '-' + datestr[2:4] + '.txt'
	yearfilename = phox_utilities.Outputfile_Stem + datestr[:2] + '.txt'

	curyear = True
	if datestr[2:] == '0101': # initialize a new year
		# New year: the daily file becomes the yearly file; no append needed
		subprocess.call("cp " + eventfilename + ' ' + yearfilename, shell=True)
		curyear = False  
	else:
		get_zipped_file(yearfilename, 'Annual')
		try:
			fyr = open(yearfilename,'a')  # this actually becomes a simple write rather than append when a new month  
		except:   
			phox_utilities.do_RuntimeError('Could not open yearly file', yearfilename)


	curmonth = True
	if datestr[4:] == '01': # just make a copy of the existing file with the DOC lines
		subprocess.call("cp " + eventfilename + ' ' + monthfilename, shell=True)  
		curmonth = False	
	else: # download existing files and append to it
		get_zipped_file(monthfilename, 'Monthly')
		try:
			fmon = open(monthfilename,'a')  # this actually becomes a simple write rather than append when a new month  
		except:   
			phox_utilities.do_RuntimeError('Could not open monthly file', monthfilename)

	# Append the day's events to whichever cumulative files were opened above
	# (fyr/fmon are only bound when the corresponding cur* flag is True).
	if curyear or curmonth:
		try:
			fin = open(eventfilename,'r')   
		except:   
			phox_utilities.do_RuntimeError('Could not open the daily event file', eventfilename)

		line = fin.readline()
		while len(line) > 0:  # loop through the file
			if 'DOC\tDOC\t999' not in line:   # copy the new lines, skipping the documentation lines
				if curmonth: fmon.write(line)
				if curyear:  fyr.write(line)  
			line = fin.readline()

		fin.close()
		if curmonth: fmon.close()
		if curyear: fyr.close()

	store_zipped_file(monthfilename, 'Monthly')
	store_zipped_file(yearfilename, 'Annual')

	ftp.quit()
	print "Finished"