def main():
    global index_dir, fp_id_title, max_file
    wiki_dump = sys.argv[1]
    index_dir = sys.argv[2]
    if not os.path.isdir(index_dir):
        os.mkdir(index_dir)

    parser = xml.sax.make_parser()  # SAX parser
    handler = WikiHandler()
    parser.setContentHandler(handler)

    start = time.time()
    fp_id_title = open_id_title(id_to_title)
    parser.parse(wiki_dump)
    write_remaining()
    # print("Number of iterations : ", len(os.listdir(index_dir)))
    fp_id_title.close()

    # itr = len(os.listdir(index_dir)) - 1
    # print("Number of iterations : ", itr)
    merge.merge_files(total_itr, index_dir, max_file)
    clear_directory(total_itr, index_dir)
    end = time.time()

    total_docs = open(os.path.join(index_dir, "total_docs.txt"), "w")
    total_docs.write(str(nod))
    total_docs.close()
    # total_folders = open(os.path.join(index_dir, "total_folders.txt"), "w")
    # total_folders.write(str(total_itr))
    # total_folders.close()
    print("Time taken : ", end - start)
def main():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.FileHandler("debug.log"), logging.StreamHandler()])
    logger = logging.getLogger(__name__)
    logger.info('starting the program')

    logger.info('getting the urls')
    file_url = open_file()
    logger.info('got the urls, ready to download the sounds')
    file_download = sheet_url()
    logger.info(f'got {file_url}, ready to download')

    download_videos = download_youtube_files(logger, file_download)
    logger.info(f'got {download_videos}, ready to merge')

    list_for_csv = download_videos[0]
    make_file_csv(list_for_csv)
    insert_row_col(list_for_csv)
    insert_info(logger, list_for_csv)
    logger.info('finished making the csv file')

    list_for_mananger = download_videos[1]
    mananger_sounds(list_for_mananger)
    merge_files(logger, list_for_mananger, 'merge_alarm.mp3')
    logger.info('finished merging the sounds')

    upload_tweet(list_for_mananger)
    logger.info('end of the program')
def _merge(self):
    # Build a nicely sorted list, ordered the way we want
    new_data = sorted(self._data, key=itemgetter('start'))
    # Merge the chunk files
    file_list = [self._out_filename + '/' + d['out'] for d in new_data]
    merge_files(file_list, self._out_filename + '.new')
    # Remove the directory
    shutil.rmtree(self._out_filename)
    # Rename the merged file into place
    os.rename(self._out_filename + '.new', self._out_filename)
def merge_pdfs():
    fn = ""
    # Call the helper (the original referenced the function object, which is always truthy)
    if verify_output_file_not_exist():
        messagebox.showerror(
            "File exists",
            "Error. File already exists. Try again with a different filename.")
        return
    if not e.get().endswith(".pdf"):
        fn = e.get() + ".pdf"
    else:
        fn = e.get()
    mg.merge_files(listbox.get(0, END), fn)
    messagebox.showinfo("File merged", "Saved output file " + fn)
def test_ordered_input_files(self):
    input_filenames = [
        'input/a.csv',
        'input/b.csv',
        'input/c.csv',
    ]
    generate_input_files(input_filenames, 15)
    for input_filename in input_filenames:
        self.assertTrue(os.path.exists(input_filename))
    output_filename = 'output/bar.csv'
    with open(output_filename, 'w') as output_fp:
        merge_files(input_filenames, output_fp.write)
    self.assertTrue(os.path.exists(output_filename))
if __name__ == '__main__':
    # First, let's handle the arguments
    parser = argparse.ArgumentParser(description='Sort a huge file.')
    parser.add_argument('--input', help='File to sort')
    parser.add_argument('--output', help='Output file')
    parser.add_argument('--tempfile',
                        help='Temporary output pattern prefix (default: output)',
                        default='output')
    parser.add_argument('--splitsize',
                        help='Number of bytes in each split (default: 10000)',
                        type=int, default=10000)
    args = parser.parse_args()

    # Split the input into manageable smaller files
    splitted_files = split_file(args.input, '%s_{0:04d}.txt' % args.tempfile, args.splitsize)

    # Sort each individual file (the loop variable no longer shadows split_file)
    for filename in splitted_files:
        sort(filename, "%s_sorted" % filename)
    splitted_files_sorted = ["%s_sorted" % filename for filename in splitted_files]

    # Merge all the files together again
    merge_files(args.output, splitted_files_sorted)

    # Let's clean up the mess we have temporarily created
    for filename in splitted_files + splitted_files_sorted:
        os.remove(filename)

    # Tada
    print("success")
tmp_files = []

# filter by PPG ID the "producao" and "prod-autor" CSV files for both "artpe" and "anais" publication types
for file_type in ["producao", "prod-autor"]:
    for prod_type in ["artpe", "anais"]:
        inputfile = f"data/{file_type}-2017a2020-{prod_type}.csv"
        outputfile = f"data/{file_type}-2017a2020-{prod_type}-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
        filter.filter_file(inputfile, "CD_PROGRAMA_IES", ppg["CD_PROGRAMA_IES"], outputfile, True)
        tmp_files.append(outputfile)

# merge the filtered files, joining the "artpe" and "anais" publication types
for file_type in ["producao", "prod-autor"]:
    basefile = f"data/{file_type}-2017a2020-anais-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
    otherfile = f"data/{file_type}-2017a2020-artpe-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
    outputfile = f"data/{file_type}-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
    merge.merge_files(basefile, otherfile, outputfile, True)

# normalize author names
inputfile = f"data/prod-autor-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
outputfile = f"data/prod-autor-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}-normalized.csv"
normalize.normalize_names(inputfile, outputfile)
os.replace(outputfile, inputfile)

# create graph from merged files
authorsfile = f"data/prod-autor-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
papersfile = f"data/producao-2017a2020-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}.csv"
outputfile = f"data/graph-{ppg['ENTIDADE_ENSINO']}-{ppg['ACRONYM']}-2017-2020.json"
sucupira.export_graph(authorsfile, papersfile, outputfile)

# cleaning up the temporary filtered files
for f in tmp_files:
    os.remove(f)
# from merge import merge_files
import merge

x = merge.merge_files('a1.txt', 'a2.txt', 'a3.txt', 'a33.txt', output='output.txt')