def download(self):
    """Download every file listed in the arXiv contents queue with s3cmd.

    Entries are read one at a time from ``self.contents_file`` through the
    ``fq`` helper and fetched with ``self.s3_cmd_ex get`` (requester-pays
    header, existing files skipped).  An entry is popped from the queue only
    after its download returned exit status 0.  The loop ends when the queue
    yields ``None``, when a download fails, or when the user types 'x'.

    The process exits with status 1 if the contents file does not exist.
    The working directory is switched to ``self.filedir`` for the duration
    of the downloads and is always switched to ``self.current_dir``
    afterwards, even if an exception interrupts the loop.
    """
    if not os.path.exists(self.contents_file):
        print("Error: arXiv contents file %s does not exist"
              % (self.contents_file,))
        sys.exit(1)

    # No local target is passed to s3cmd below, so run it from the download
    # folder.
    # NOTE(review): the existence check above runs before this chdir; if
    # self.contents_file is a relative path, fq will resolve it against
    # self.filedir instead -- confirm it is absolute.
    os.chdir(self.filedir)
    try:
        print("Press 'x' to break after the current download.")
        while True:
            arxiv_file_line = fq.get(self.contents_file)
            if arxiv_file_line is None:
                break
            print("Processing  %s" % (arxiv_file_line,))
            return_code = call([
                self.s3_cmd_ex,
                'get',
                '--add-header=x-amz-request-payer: requester',
                '--skip-existing',
                arxiv_file_line,
            ])
            if return_code != 0:
                print("Error downloading %s" % (arxiv_file_line,))
                break
            # Only drop the entry once it has been fetched successfully.
            fq.pop(self.contents_file)
            # Give the user one second to request a stop.
            if 'x' in nbRawInput('', timeout=1):
                print("Download suspended. Restart script to resume.")
                break
    finally:
        # Restore the project folder no matter how the loop ended.
        os.chdir(self.current_dir)
def download(self):
    """Download every file listed in the arXiv contents queue with s3cmd.

    Entries are read one at a time from ``self.contents_file`` through the
    ``fq`` helper and fetched with ``self.s3_cmd_ex get`` (requester-pays
    header, existing files skipped).  An entry is popped from the queue only
    after its download returned exit status 0.  The loop ends when the queue
    yields ``None``, when a download fails, or when the user types 'x'.

    The process exits with status 1 if the contents file does not exist.
    The working directory is switched to ``self.filedir`` for the duration
    of the downloads and is always switched to ``self.current_dir``
    afterwards, even if an exception interrupts the loop.
    """
    if not os.path.exists(self.contents_file):
        print("Error: arXiv contents file %s does not exist"
              % (self.contents_file,))
        sys.exit(1)

    # No local target is passed to s3cmd below, so run it from the download
    # folder.
    # NOTE(review): the existence check above runs before this chdir; if
    # self.contents_file is a relative path, fq will resolve it against
    # self.filedir instead -- confirm it is absolute.
    os.chdir(self.filedir)
    try:
        print("Press 'x' to break after the current download.")
        while True:
            arxiv_file_line = fq.get(self.contents_file)
            if arxiv_file_line is None:
                break
            print("Processing  %s" % (arxiv_file_line,))
            return_code = call([
                self.s3_cmd_ex,
                'get',
                '--add-header=x-amz-request-payer: requester',
                '--skip-existing',
                arxiv_file_line,
            ])
            if return_code != 0:
                print("Error downloading %s" % (arxiv_file_line,))
                break
            # Only drop the entry once it has been fetched successfully.
            fq.pop(self.contents_file)
            # Give the user one second to request a stop.
            if 'x' in nbRawInput('', timeout=1):
                print("Download suspended. Restart script to resume.")
                break
    finally:
        # Restore the project folder no matter how the loop ended.
        os.chdir(self.current_dir)
def retrieve_citations(self):
    """Extract and store citations for every archive in the citation queue.

    On first use the queue file ``self.citation_queue`` is created by
    listing the ``*.gz`` files under ``self.extract_dir``.  Each queued
    archive is unpacked into ``self.tmp_dir + <arxiv id>``; every ``.tex``
    or ``.bbl`` file found there is passed to
    ``self.settings["metadata_reader"].process`` and the result is handed
    to ``self.store_citations``.  The scratch directory is then removed and
    the entry popped from the queue.  Processing stops when the queue
    yields ``None`` or when the scratch directory cannot be removed.
    """
    if not os.path.exists(self.tmp_dir):
        os.mkdir(self.tmp_dir)

    # Create the citation queue if it doesn't exist by finding all the gz
    # files in the extract folder.
    if not os.path.exists(self.citation_queue):
        call('find {source_dir}*.gz -type f > {target_file}'.format(
            source_dir=self.extract_dir,
            target_file=self.citation_queue), shell=True)

    batcher = Batch.Batch()
    while True:
        file_name = fq.get(self.citation_queue)
        if file_name is None:
            break

        # The arXiv id is the archive's file name without its extension.
        arxiv_id = os.path.splitext(os.path.basename(file_name))[0]
        print("Retrieving citations %s" % (arxiv_id,))

        uncompressed_tmp = self.tmp_dir + arxiv_id
        if not os.path.exists(uncompressed_tmp):
            os.mkdir(uncompressed_tmp)

        returncode = call(["tar", "xzf", file_name, "-C", uncompressed_tmp])
        if returncode != 0:
            # tar failed, so perhaps it is not a tar file; fall back to
            # plain gunzip.  Any non-zero status counts: GNU tar reports
            # fatal errors with exit status 2, not 1.
            print("trying to gunzip instead for " + file_name)
            os.system("gunzip -c %s > %s"
                      % (file_name, uncompressed_tmp + "/file.tex"))

        # Now process the .tex / .bbl files.
        for tex_file_name in os.listdir(uncompressed_tmp):
            if not tex_file_name.endswith(('.tex', '.bbl')):
                continue
            citations = self.settings["metadata_reader"].process(
                arxiv_id, uncompressed_tmp + '/' + tex_file_name)
            # Store the citations in BibServer.
            self.store_citations(batcher, arxiv_id, citations)

        # Delete temporary files; stop if the cleanup command fails.
        if call('rm -R ' + uncompressed_tmp + '*', shell=True):
            break
        fq.pop(self.citation_queue)

    # NOTE(review): called once after the loop; presumably this flushes
    # whatever the batcher still holds -- confirm against Batch.
    batcher.clear()
def retrieve_citations(self):
    """Extract and store citations for every archive in the citation queue.

    On first use the queue file ``self.citation_queue`` is created by
    listing the ``*.gz`` files under ``self.extract_dir``.  Each queued
    archive is unpacked into ``self.tmp_dir + <arxiv id>``; every ``.tex``
    or ``.bbl`` file found there is passed to
    ``self.settings["metadata_reader"].process`` and the result is handed
    to ``self.store_citations``.  The scratch directory is then removed and
    the entry popped from the queue.  Processing stops when the queue
    yields ``None`` or when the scratch directory cannot be removed.
    """
    if not os.path.exists(self.tmp_dir):
        os.mkdir(self.tmp_dir)

    # Create the citation queue if it doesn't exist by finding all the gz
    # files in the extract folder.
    if not os.path.exists(self.citation_queue):
        call('find {source_dir}*.gz -type f > {target_file}'.format(
            source_dir=self.extract_dir,
            target_file=self.citation_queue), shell=True)

    batcher = Batch.Batch()
    while True:
        file_name = fq.get(self.citation_queue)
        if file_name is None:
            break

        # The arXiv id is the archive's file name without its extension.
        arxiv_id = os.path.splitext(os.path.basename(file_name))[0]
        print("Retrieving citations %s" % (arxiv_id,))

        uncompressed_tmp = self.tmp_dir + arxiv_id
        if not os.path.exists(uncompressed_tmp):
            os.mkdir(uncompressed_tmp)

        returncode = call(["tar", "xzf", file_name, "-C", uncompressed_tmp])
        if returncode != 0:
            # tar failed, so perhaps it is not a tar file; fall back to
            # plain gunzip.  Any non-zero status counts: GNU tar reports
            # fatal errors with exit status 2, not 1.
            print("trying to gunzip instead for " + file_name)
            os.system("gunzip -c %s > %s"
                      % (file_name, uncompressed_tmp + "/file.tex"))

        # Now process the .tex / .bbl files.
        for tex_file_name in os.listdir(uncompressed_tmp):
            if not tex_file_name.endswith(('.tex', '.bbl')):
                continue
            citations = self.settings["metadata_reader"].process(
                arxiv_id, uncompressed_tmp + '/' + tex_file_name)
            # Store the citations in BibServer.
            self.store_citations(batcher, arxiv_id, citations)

        # Delete temporary files; stop if the cleanup command fails.
        if call('rm -R ' + uncompressed_tmp + '*', shell=True):
            break
        fq.pop(self.citation_queue)

    # NOTE(review): called once after the loop; presumably this flushes
    # whatever the batcher still holds -- confirm against Batch.
    batcher.clear()
def main():
    """Unpack every queued bucket tar and move its .gz members aside.

    Uses the module-level settings ``tmp_dir``, ``extract_dir``,
    ``bucket_dir``, ``extraction_queue`` and ``RESUME``.  The queue file is
    (re)built from the files under ``bucket_dir`` when it is missing or
    when ``RESUME`` is false.  Each queued archive is extracted into
    ``tmp_dir``, its ``*.gz`` files are moved to ``extract_dir``, the
    scratch files are removed and the entry is popped from the queue.  The
    loop ends when the queue yields ``None``, when any of the shell
    commands fails, or when the user types 'x'.
    """
    print('Press "x" to break')
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.exists(extract_dir):
        os.mkdir(extract_dir)

    if not os.path.exists(extraction_queue) or not RESUME:
        call('find {source_dir} -type f > {target_file}'.format(
            source_dir=bucket_dir,
            target_file=extraction_queue), shell=True)

    while True:
        file_name = fq.get(extraction_queue)
        if file_name is None:
            break
        print("Extracting bucket %s" % (file_name,))

        # call() returns the command's exit status; non-zero means failure.
        if call(['tar', 'xf', file_name, '-C', tmp_dir]):
            break
        # The pattern is quoted so the shell hands it to find untouched;
        # unquoted, *.gz would be expanded against the current directory
        # whenever that directory contains matching files.
        if call("find %s -name '*.gz' -type f -exec mv {} %s \\;"
                % (tmp_dir, extract_dir), shell=True):
            break
        # Remove whatever matches tmp_dir* before the next archive.
        if call('rm -R ' + tmp_dir + '*', shell=True):
            break
        fq.pop(extraction_queue)

        # Give the user one second to request a stop.
        if nbRawInput('', timeout=1) == 'x':
            print("Extraction suspended. Restart script to resume.")
            break
def main():
    """Unpack every queued bucket tar and move its .gz members aside.

    Uses the module-level settings ``tmp_dir``, ``extract_dir``,
    ``bucket_dir``, ``extraction_queue`` and ``RESUME``.  The queue file is
    (re)built from the files under ``bucket_dir`` when it is missing or
    when ``RESUME`` is false.  Each queued archive is extracted into
    ``tmp_dir``, its ``*.gz`` files are moved to ``extract_dir``, the
    scratch files are removed and the entry is popped from the queue.  The
    loop ends when the queue yields ``None``, when any of the shell
    commands fails, or when the user types 'x'.
    """
    print('Press "x" to break')
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.exists(extract_dir):
        os.mkdir(extract_dir)

    if not os.path.exists(extraction_queue) or not RESUME:
        call('find {source_dir} -type f > {target_file}'.format(
            source_dir=bucket_dir,
            target_file=extraction_queue), shell=True)

    while True:
        file_name = fq.get(extraction_queue)
        if file_name is None:
            break
        print("Extracting bucket %s" % (file_name,))

        # call() returns the command's exit status; non-zero means failure.
        if call(['tar', 'xf', file_name, '-C', tmp_dir]):
            break
        # The pattern is quoted so the shell hands it to find untouched;
        # unquoted, *.gz would be expanded against the current directory
        # whenever that directory contains matching files.
        if call("find %s -name '*.gz' -type f -exec mv {} %s \\;"
                % (tmp_dir, extract_dir), shell=True):
            break
        # Remove whatever matches tmp_dir* before the next archive.
        if call('rm -R ' + tmp_dir + '*', shell=True):
            break
        fq.pop(extraction_queue)

        # Give the user one second to request a stop.
        if nbRawInput('', timeout=1) == 'x':
            print("Extraction suspended. Restart script to resume.")
            break
def extract(self):
    """Unpack every queued bucket tar and move its .gz members aside.

    On first use the queue file ``self.extraction_queue`` is created by
    listing the ``*.tar`` files under ``self.filedir``.  Each queued
    archive is extracted into ``self.tmp_dir``, its ``*.gz`` files are
    moved to ``self.extract_dir``, the scratch files are removed and the
    entry is popped from the queue.  The loop ends when the queue yields
    ``None``, when any of the shell commands fails, or when the user
    types 'x'.
    """
    print("Press 'x' to interupt the extraction process")
    if not os.path.exists(self.tmp_dir):
        os.mkdir(self.tmp_dir)
    if not os.path.exists(self.extract_dir):
        os.mkdir(self.extract_dir)

    # Create the extraction queue if it doesn't exist by finding all the
    # tar files in the download folder.
    if not os.path.exists(self.extraction_queue):
        call('find {source_dir}*.tar -type f > {target_file}'.format(
            source_dir=self.filedir,
            target_file=self.extraction_queue), shell=True)

    while True:
        file_name = fq.get(self.extraction_queue)
        if file_name is None:
            break
        print("Extracting bucket %s" % (file_name,))

        # call() returns the command's exit status; non-zero means failure.
        if call(['tar', 'xf', file_name, '-C', self.tmp_dir]):
            break
        # The pattern is quoted so the shell hands it to find untouched;
        # unquoted, *.gz would be expanded against the current directory
        # whenever that directory contains matching files.
        if call("find %s -name '*.gz' -type f -exec mv {} %s \\;"
                % (self.tmp_dir, self.extract_dir), shell=True):
            break
        # Remove whatever matches self.tmp_dir* before the next archive.
        if call('rm -R ' + self.tmp_dir + '*', shell=True):
            break
        fq.pop(self.extraction_queue)

        # Give the user one second to request a stop.
        if nbRawInput('', timeout=1) == 'x':
            print("Extraction suspended. Restart script to resume.")
            break
def extract(self):
    """Unpack every queued bucket tar and move its .gz members aside.

    On first use the queue file ``self.extraction_queue`` is created by
    listing the ``*.tar`` files under ``self.filedir``.  Each queued
    archive is extracted into ``self.tmp_dir``, its ``*.gz`` files are
    moved to ``self.extract_dir``, the scratch files are removed and the
    entry is popped from the queue.  The loop ends when the queue yields
    ``None``, when any of the shell commands fails, or when the user
    types 'x'.
    """
    print("Press 'x' to interupt the extraction process")
    if not os.path.exists(self.tmp_dir):
        os.mkdir(self.tmp_dir)
    if not os.path.exists(self.extract_dir):
        os.mkdir(self.extract_dir)

    # Create the extraction queue if it doesn't exist by finding all the
    # tar files in the download folder.
    if not os.path.exists(self.extraction_queue):
        call('find {source_dir}*.tar -type f > {target_file}'.format(
            source_dir=self.filedir,
            target_file=self.extraction_queue), shell=True)

    while True:
        file_name = fq.get(self.extraction_queue)
        if file_name is None:
            break
        print("Extracting bucket %s" % (file_name,))

        # call() returns the command's exit status; non-zero means failure.
        if call(['tar', 'xf', file_name, '-C', self.tmp_dir]):
            break
        # The pattern is quoted so the shell hands it to find untouched;
        # unquoted, *.gz would be expanded against the current directory
        # whenever that directory contains matching files.
        if call("find %s -name '*.gz' -type f -exec mv {} %s \\;"
                % (self.tmp_dir, self.extract_dir), shell=True):
            break
        # Remove whatever matches self.tmp_dir* before the next archive.
        if call('rm -R ' + self.tmp_dir + '*', shell=True):
            break
        fq.pop(self.extraction_queue)

        # Give the user one second to request a stop.
        if nbRawInput('', timeout=1) == 'x':
            print("Extraction suspended. Restart script to resume.")
            break
# Download every file listed in the s3_contents.txt queue with s3cmd.
# All paths are derived from the directory the script is started in.
cur_dir = os.getcwd()
contents_file = cur_dir + '/s3_contents.txt'
s3_cmd_ex = cur_dir + "/../tools/s3cmd/s3cmd"
dl_dir = cur_dir + '/../DATA/BUCKETS/'

if not os.path.exists(dl_dir):
    os.makedirs(dl_dir)
# No local target is passed to s3cmd below, so run it from the download
# folder.
os.chdir(dl_dir)

print("Press 'x' to suspend after the current download.")
while True:
    line = fq.get(contents_file)
    # The queue yields None when there is nothing left to fetch.
    if line is None:
        break
    print("Processing  %s" % (line,))
    return_code = call([
        s3_cmd_ex,
        'get',
        '--add-header=x-amz-request-payer: requester',
        '--skip-existing',
        line,
    ])
    if return_code != 0:
        print("ERROR downloading %s" % (line,))
        break
    # Only drop the entry once it has been fetched successfully.
    fq.pop(contents_file)
    # Give the user one second to request a stop.
    if 'x' in nbRawInput('', timeout=1):
        print("Download suspended. Restart script to resume.")
        break
# Download every file listed in the contents queue with s3cmd.
# Relies on cur_dir and contents_file having been set earlier in the script.
s3_cmd_ex = cur_dir + "/../tools/s3cmd/s3cmd"
dl_dir = cur_dir + '/../DATA/BUCKETS/'

if not os.path.exists(dl_dir):
    os.makedirs(dl_dir)
# No local target is passed to s3cmd below, so run it from the download
# folder.
os.chdir(dl_dir)

print("Press 'x' to suspend after the current download.")
while True:
    line = fq.get(contents_file)
    # The queue yields None when there is nothing left to fetch.
    if line is None:
        break
    print("Processing  %s" % (line,))
    return_code = call([
        s3_cmd_ex,
        'get',
        '--add-header=x-amz-request-payer: requester',
        '--skip-existing',
        line,
    ])
    if return_code != 0:
        print("ERROR downloading %s" % (line,))
        break
    # Only drop the entry once it has been fetched successfully.
    fq.pop(contents_file)
    # Give the user one second to request a stop.
    if 'x' in nbRawInput('', timeout=1):
        print("Download suspended. Restart script to resume.")
        break