def store_zipped_file(filename, dirname):
    """
    Zips filename and uploads filename.zip into the FTP subdirectory dirname,
    then cds back out to the parent directory.

    Parameters
    ----------
    filename : str -- local file to compress and upload
    dirname  : str -- remote subdirectory (relative to current FTP cwd)

    Raises: RuntimeError (via phox_utilities.do_RuntimeError) on any failure.
    Uses the module-global `ftp` connection, which must already be logged in.
    """
    global ftp
    filezip = filename + '.zip'
    try:
        # list argv instead of a shell string: safe for filenames with
        # spaces/metacharacters and avoids an unnecessary shell
        subprocess.call(["zip", filezip, filename])
        ftp.cwd(dirname)                 # change into subdirectory
        # fix: storbinary requires the file opened in *binary* mode, and the
        # original leaked the handle -- close it explicitly
        fzip = open(filezip, 'rb')
        try:
            ftp.storbinary("STOR " + filezip, fzip)
        finally:
            fzip.close()
        ftp.cwd('..')                    # back out
    except:
        # broad except kept deliberately: any failure here means the ftp
        # connection or file structure is bad; do_RuntimeError logs and raises
        phox_utilities.do_RuntimeError('Store of', filename, '.zip unsuccessful')
def main(thisday): scraperfilename = phox_utilities.Scraper_Stem + thisday + ".txt" print "Mongo: Scraper file name:", scraperfilename recordfilename = phox_utilities.Recordfile_Stem + thisday + ".txt" print "Mongo: Record file name:", recordfilename newsourcefile = newsourcestem + thisday + ".txt" print "Mongo: New Sources file name:", newsourcefile try: fin = open(scraperfilename, "r") except IOError: phox_utilities.do_RuntimeError("Could not find the scraper file for", thisday) finlist = fin.readlines() fout = open(recordfilename, "w") newout = open(newsourcefile, "w") sourcecount = {} storyno = 1 csno = 1 for line in range(0, len(finlist)): if "http" in finlist[line]: field = finlist[line].split("\t") thisURL = field[2][:-1] thisURL = thisURL[:MAX_URLLENGTH] # temporary to accommodate TABARI input limits thisstory = get_story(finlist[line + 1]) thisdate = get_date(field) thissource = get_source(field) if thissource == "999": print >> newout, thisURL # Adds sources not included in sources dictionary to 'newsource_results_20..' file output write_record(thissource, sourcecount, thisdate, thisURL, thisstory, fout) fin.close() fout.close() print "Finished"
def get_zipped_file(filename, dirname):
    """
    Downloads filename.zip from the FTP subdirectory dirname into
    tempfile.zip, cds back out to the parent directory, then unzips and
    removes the temporary file.

    Parameters
    ----------
    filename : str -- remote base name (without the .zip extension)
    dirname  : str -- remote subdirectory (relative to current FTP cwd)

    Raises: RuntimeError (via phox_utilities.do_RuntimeError) on retrieval or
    decompression failure. Uses the module-global `ftp` connection.
    """
    global ftp
    fbin = open('tempfile.zip', 'wb')
    try:
        ftp.cwd(dirname)   # change into subdirectory
        ftp.retrbinary("RETR " + filename + '.zip', fbin.write)
        ftp.cwd('..')      # back out
        phox_utilities.logger.info('Successfully retrieved ' + filename + '.zip\n')
    except:
        fbin.close()       # fix: handle was leaked on the error path
        phox_utilities.do_RuntimeError('Retrieval of', filename,
                                       '.zip unsuccessful')
        return
    fbin.close()
    try:
        # list argv instead of shell strings: no shell needed and safe for
        # odd filenames; -o: overwrite without prompting
        subprocess.call(["unzip", "-o", "tempfile.zip"])
        subprocess.call(["rm", "tempfile.zip"])   # clean up
    except:
        phox_utilities.do_RuntimeError('Downloaded file', filename,
                                       'could not be decompressed')
# NOTE(review): this chunk opens mid-way through the story loop of a main()
# whose definition header is outside this view (it duplicates the tail of the
# Mongo-formatter main above); indentation below reconstructs that context.
            field = finlist[line].split("\t")
            thisURL = field[2][:-1]   # strip trailing newline
            # temporary to accommodate TABARI input limits
            thisURL = thisURL[:MAX_URLLENGTH]
            thisstory = get_story(finlist[line + 1])
            thisdate = get_date(field)
            thissource = get_source(field)
            if thissource == "999":
                # Adds sources not included in sources dictionary to
                # 'newsource_results_20..' file output
                print >> newout, thisURL
            write_record(thissource, sourcecount, thisdate, thisURL,
                         thisstory, fout)
    fin.close()
    fout.close()
    print "Finished"


if __name__ == "__main__":
    # Stand-alone test mode: a second command-line argument triggers local
    # logger/config initialization instead of relying on the pipeline caller.
    if len(sys.argv) > 2:
        # initializations for stand-alone tests
        phox_utilities.init_logger("test_pipeline.log")
        logger = phox_utilities.logger   # get a local copy for the pipeline
        # initialize the various phox_utilities globals
        phox_utilities.parse_config("test_config.ini")
    if len(sys.argv) > 1:
        thisday = sys.argv[1]   # date suffix, e.g. YYMMDD -- TODO confirm format
    else:
        phox_utilities.do_RuntimeError("No date suffix in Mongo.formatter.py")
    main(thisday)
def main(datestr):
    """
    One-A-Day filter: reads the full event file for datestr and collapses
    duplicate events within each day.

    An event key is target+source+code (field[1]+field[2]+field[3]); the
    first record for a key is kept in evtdict with a duplicate counter
    appended, and evtdup accumulates per-news-source duplicate details.
    Output is written by the module-level writeevents()/writedups() helpers
    through the globals declared below -- NOTE(review): their exact output
    format is defined elsewhere in this file; confirm before changing field
    layout here.

    Parameters: datestr : str -- date suffix used to build file names
    Raises: RuntimeError (via phox_utilities.do_RuntimeError) if the full
            event file is missing.
    """
    global fout, evtdict, DUPCOUNT, evtdup, curday
    try:
        fin = open(phox_utilities.Fullfile_Stem + datestr + '.txt', 'r')
    except IOError:
        phox_utilities.do_RuntimeError('Could not find the full event file for', datestr)
    eventfilename = phox_utilities.Eventfile_Stem + datestr + '.txt'
    fout = open(eventfilename, 'w')
    print 'Writing', eventfilename
    curday = '000000'   # sentinel: no day seen yet
    dayno = 1
    line = fin.readline()
    while len(line) > 0:   # loop through the file
        field = line[:-1].split('\t')
        # print '--',field
        if field[0] != curday:
            # day boundary: flush the previous day's events and duplicates,
            # then reset the per-day state
            writeevents()
            if curday != '000000':
                writedups(curday)
            curday = field[0]
            evtdict = {}
            evtdup = {}
        # string to check against for duplicates
        evt = field[1] + field[2] + field[3]
        src = field[5][0:3]   # 3-char news-source prefix -- TODO confirm
        field[6] = field[6][:16]   # debug -- readability
        field.append(1)   # appended slot is the duplicate count (DUPCOUNT index)
        # print evt
        if evt in evtdict:   # duplicate
            # print '++',field
            evtdict[evt][DUPCOUNT] += 1
            # print evt, evtdict[evt][5], evtdict[evt][6]
            gotsrc = False
            # look for an existing per-source record for this event
            for ka in range(len(evtdup[evt])):
                if evtdup[evt][ka][0] == src:
                    evtdup[evt][ka][1] += 1
                    evtdup[evt][ka].append(field[5])
                    evtdup[evt][ka].append(field[6])
                    gotsrc = True
                    break
            if not gotsrc:   # new source
                evtdup[evt].append([src, 1, field[5], field[6]])
        else:
            # first occurrence of this event for the current day
            evtdict[evt] = field
            evtdup[evt] = []
        dayno += 1
        # if dayno > 128: sys.exit()   # debug
        line = fin.readline()
    fin.close()
    writeevents()   # write final day (which is only day in the pipeline call)
    fout.close()
    writedups(datestr)
    print "Finished"
def main(datestr): """ When something goes amiss, various routines will and pass through a RuntimeError(explanation) rather than trying to recover, since this probably means something is either wrong with the ftp connection or the file structure got corrupted. This error is logged but needs to be caught in the calling program. """ global ftp # log into the server try: ftp = FTP(phox_utilities.Server_List[0]) # connect to host, default port ftp.login(phox_utilities.Server_List[1], phox_utilities.Server_List[2]) ftp.cwd(phox_utilities.Server_List[3]) # change into PHOX directory print 'Logged into:', phox_utilities.Server_List[0], '/', phox_utilities.Server_List[1] except: phox_utilities.do_RuntimeError('Login to', phox_utilities.Server_List[0], 'unsuccessful') # upload the daily event and duplicate index files try: eventfilename = phox_utilities.Eventfile_Stem + datestr + '.txt' store_zipped_file(eventfilename,'Daily') except: phox_utilities.do_RuntimeError('Transfer of', eventfilename,'unsuccessful') try: dupfilename = phox_utilities.Dupfile_Stem + datestr + '.txt' store_zipped_file(dupfilename,'Daily/Duplicates') ftp.cwd('..') # back out one more level except: phox_utilities.do_RuntimeError('Transfer of', dupfilename,'unsuccessful') # update the monthly and yearly files monthfilename = phox_utilities.Outputfile_Stem + datestr[:2] + '-' + datestr[2:4] + '.txt' yearfilename = phox_utilities.Outputfile_Stem + datestr[:2] + '.txt' curyear = True if datestr[2:] == '0101': # initialize a new year subprocess.call("cp " + eventfilename + ' ' + yearfilename, shell=True) curyear = False else: get_zipped_file(yearfilename, 'Annual') try: fyr = open(yearfilename,'a') # this actually becomes a simple write rather than append when a new month except: phox_utilities.do_RuntimeError('Could not open yearly file', yearfilename) curmonth = True if datestr[4:] == '01': # just make a copy of the existing file with the DOC lines subprocess.call("cp " + eventfilename + ' ' + 
monthfilename, shell=True) curmonth = False else: # download existing files and append to it get_zipped_file(monthfilename, 'Monthly') try: fmon = open(monthfilename,'a') # this actually becomes a simple write rather than append when a new month except: phox_utilities.do_RuntimeError('Could not open monthly file', monthfilename) if curyear or curmonth: try: fin = open(eventfilename,'r') except: phox_utilities.do_RuntimeError('Could not open the daily event file', eventfilename) line = fin.readline() while len(line) > 0: # loop through the file if 'DOC\tDOC\t999' not in line: # copy the new lines, skipping the documentation lines if curmonth: fmon.write(line) if curyear: fyr.write(line) line = fin.readline() fin.close() if curmonth: fmon.close() if curyear: fyr.close() store_zipped_file(monthfilename, 'Monthly') store_zipped_file(yearfilename, 'Annual') ftp.quit() print "Finished"