def archive_log_files(args, proc_cfg, proc_status):
    """Archive the log files for the current execution

    Args:
        args <args>: Command line arguments
        proc_cfg <ConfigParser>: Configuration
        proc_status <bool>: True = Success, False = Error
    """

    base_log = cli_log_filename(args)
    proc_log = EspaLogging.get_filename(settings.PROCESSING_LOGGER)
    dist_path = proc_cfg.get('processing', 'espa_log_archive')
    destination_path = os.path.join(dist_path, args.order_id)

    # Create the archive path
    util.create_directory(destination_path)

    # Copy them
    copy_log_file(base_log, destination_path, proc_status)
    copy_log_file(proc_log, destination_path, proc_status)

    # Remove the source versions
    if os.path.exists(base_log):
        os.unlink(base_log)

    if os.path.exists(proc_log):
        os.unlink(proc_log)
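# A minimal, self-contained sketch of the same "archive then remove" pattern used above,
# built only on the standard library. The paths and the failure-suffix convention are
# hypothetical illustrations; copy_log_file() itself is not shown in this section, so
# this is only an assumption about what such a helper might do, not the project's code.
import os
import shutil


def _archive_one_log(log_path, archive_dir, success):
    """Copy one log file into archive_dir, tagging failures, then remove the source."""
    if not os.path.exists(log_path):
        return None
    os.makedirs(archive_dir, exist_ok=True)
    suffix = '' if success else '.error'
    destination = os.path.join(archive_dir, os.path.basename(log_path) + suffix)
    shutil.copyfile(log_path, destination)
    os.unlink(log_path)
    return destination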
def archive_log_files(order_id, product_id):
    """Archive the log files for the current job
    """

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    try:
        # Determine the destination path for the logs
        output_dir = Environment().get_distribution_directory()
        destination_path = os.path.join(output_dir, 'logs', order_id)
        # Create the path
        utilities.create_directory(destination_path)

        # Job log file
        logfile_path = EspaLogging.get_filename(settings.PROCESSING_LOGGER)
        full_logfile_path = os.path.abspath(logfile_path)
        log_name = os.path.basename(full_logfile_path)
        # Determine full destination
        destination_file = os.path.join(destination_path, log_name)
        # Copy it
        shutil.copyfile(full_logfile_path, destination_file)

        # Mapper log file
        full_logfile_path = os.path.abspath(MAPPER_LOG_FILENAME)
        final_log_name = '-'.join([MAPPER_LOG_PREFIX, order_id, product_id])
        final_log_name = '.'.join([final_log_name, 'log'])
        # Determine full destination
        destination_file = os.path.join(destination_path, final_log_name)
        # Copy it
        shutil.copyfile(full_logfile_path, destination_file)

    except Exception:
        # We don't care because we are at the end of processing
        # And if we are on the successful path, we don't care either
        logger.exception('Exception encountered and follows')
def initialize_processing_directory(base_work_dir, bucket_name,
                                    directories=['output', 'stage', 'work']):
    """ Initializes the processing directory and subfolders

    Args:
        base_work_dir (str): relative or absolute path to the base working directory
        bucket_name (str): additional subdirectory to work in
        directories (list): all subfolders to create under the base directory

    Returns:
        dict: created directories, keyed by basename
    """
    new_directories = dict()

    if os.path.exists(base_work_dir):
        logging.warning('Removing processing directory: %s', base_work_dir)
        shutil.rmtree(base_work_dir, ignore_errors=True)

    new_directories['base'] = work_dir = os.path.join(base_work_dir, bucket_name)
    logging.info('Create processing directory: %s', work_dir)
    utilities.create_directory(work_dir)

    for folder in dirs_to_make(work_dir, directories):
        logging.debug('Create directory: %s', folder)
        utilities.create_directory(folder)
        new_directories.update({os.path.basename(folder): folder})

    return new_directories
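# A hypothetical call sketch for the function above; the base path and bucket name are
# made up for illustration, and it relies on the surrounding module's
# utilities.create_directory and dirs_to_make helpers, so it shows intended use rather
# than being a standalone program. The returned dict is keyed by 'base' plus each
# subfolder's basename.
dirs = initialize_processing_directory('/tmp/espa-jobs', 'order-12345')
staged_input = dirs['stage']      # .../order-12345/stage
work_area = dirs['work']          # .../order-12345/work
packaged_output = dirs['output']  # .../order-12345/output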
def make_values_and_parameter_sweep(output_dir, bag_file, map_file, image_topic,
                                    config_path, robot_config, world,
                                    use_image_features):
    output_dir = utilities.create_directory(output_dir)
    print('Output directory for results is {}'.format(output_dir))

    value_ranges, value_names = make_value_ranges()
    save_values(value_names, value_ranges, 'individual_value_ranges.csv', output_dir)

    all_value_combos = make_all_value_combinations(value_ranges)
    save_values(value_names, all_value_combos, 'all_value_combos.csv', output_dir)

    parameter_sweep(all_value_combos, value_names, output_dir, bag_file, map_file,
                    image_topic, config_path, robot_config, world, use_image_features)

    combined_results_file = os.path.join(output_dir, 'param_sweep_combined_results.csv')
    value_combos_file = os.path.join(output_dir, 'all_value_combos.csv')
    results_pdf_file = os.path.join(output_dir, 'param_sweep_results.pdf')
    plot_parameter_sweep_results.create_plot(results_pdf_file, combined_results_file,
                                             value_combos_file)
    return output_dir
def generate_normalizers(datafolds, fsample, dataset_dir, cont_cols,
                         normalize_options=('standardize', 'rescale')):
    for fold_name in datafolds:
        print("fold ", fold_name)
        train_idx = datafolds[fold_name][0]
        train_sample = fsample.loc[fsample['nrd_visitlink'].isin(train_idx)].copy()
        dsets = (train_sample, )
        for norm_option in normalize_options:
            print("norm_option: ", norm_option)
            dirname = "{}_{}".format(fold_name, norm_option)
            cdir = create_directory(dirname, dataset_dir)
            if norm_option == 'standardize':
                normalizer = GaussianNormalizerInfo
            elif norm_option == 'meanrange':
                normalizer = MeanRangeNormalizerInfo
            elif norm_option == 'rescale':
                normalizer = RescaleNormalizerInfo
            for dset in dsets:
                a, b = get_feature_normalizer(dset, cont_cols, norm_option)
                ReaderWriter.dump_data(
                    normalizer(a, b),
                    os.path.join(cdir, "{}_info.pkl".format(norm_option)))
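# A small refactoring sketch (not part of the original code): mapping option names to
# the normalizer classes referenced above avoids the unbound-variable failure the
# if/elif chain allows when an unrecognized option slips into normalize_options.
NORMALIZERS = {
    'standardize': GaussianNormalizerInfo,
    'meanrange': MeanRangeNormalizerInfo,
    'rescale': RescaleNormalizerInfo,
}


def pick_normalizer(norm_option):
    """Return the normalizer info class for norm_option, or fail loudly."""
    try:
        return NORMALIZERS[norm_option]
    except KeyError:
        raise ValueError('Unknown normalization option: {}'.format(norm_option))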
def create_local_output_directory(base_path):
    '''
    Description:
        Creates a local output directory.

    Note: "local" in this case means a standard directory.

    Returns:
        string: The fullpath to the "output" directory.

    Parameters:
        base_path - The location under which to create the "output" directory.
    '''

    full_path = os.path.join(base_path, 'output')

    utilities.create_directory(full_path)

    return full_path
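# utilities.create_directory() is used throughout these snippets but is not shown here.
# A plausible, minimal stand-in is an idempotent wrapper around os.makedirs, sketched
# below; this is an assumption about its behavior, not the project's actual helper.
import errno
import os


def create_directory(directory):
    """Create the directory (and any parents) if it does not already exist."""
    try:
        os.makedirs(directory)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(directory):
            raise
    return directory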
def tokenized_scrape_file(n_month, n_year, lang, out_dir):
    """
    Read the scrape file and tokenize the sentences
    Write the tokenized file
    """
    if lang == "en":
        language = "English"
    elif lang == "hi":
        language = "Hindi"
    else:
        print("Invalid language code passed")
        return None
    work_dir = out_dir + "//" + n_month + "_" + n_year
    scrape_loc = work_dir + "//" + "_".join(["scrape_file", lang, n_month, n_year])
    tokenize_loc = work_dir + "//" + "_".join(["tokenize", lang, n_month, n_year])
    create_directory(tokenize_loc)
    fl_list = sorted(glob.glob(os.path.join(scrape_loc, "*.txt")))
    for k, fl in enumerate(fl_list):
        print(os.path.basename(fl))
        flname = tokenize_loc + "//tok_" + os.path.basename(fl)
        with open(fl, mode="r", encoding="utf-16") as file_n:
            para_val = [{"text": line.strip()} for line in file_n
                        if len(line.strip().split()) > 2]
        if len(para_val) > 500:
            sen = []
            for i in range(int(np.ceil(len(para_val) / 500)) + 1):
                js = {"paragraphs": para_val[i * 500:(i + 1) * 500]}
                sen_sub = api_sen_tokenizer_call(js, lang)
                for line in sen_sub:
                    sen.append(line)
        else:
            js = {"paragraphs": para_val}
            sen = api_sen_tokenizer_call(js, lang)
        dump_1 = (pd.DataFrame(sen, columns=["sen"])
                  .drop_duplicates().loc[:, "sen"].values.tolist())
        sen = dump_1
        write_sentence_list_to_file(flname, sen)
    return None
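# The >500-paragraph branch above walks fixed windows of 500 items. A compact way to
# express the same chunking is a stride-based slice, sketched below with a hypothetical
# chunk() helper; it also avoids submitting a trailing empty batch when the paragraph
# count is an exact multiple of the window size.
def chunk(items, size=500):
    """Yield consecutive slices of at most `size` items."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# e.g.:
#   sen = []
#   for batch in chunk(para_val):
#       sen.extend(api_sen_tokenizer_call({"paragraphs": batch}, lang))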
def __create_local_directory(base_path, directory_name):
    '''
    Description:
        Creates a local directory under the base path.

    Note: "local" in this case means a standard directory.

    Returns:
        string: The fullpath to the directory created.

    Parameters:
        string: base_path - The location where to create the directory.
        string: directory_name - The name of the directory to be created.
    '''

    full_path = os.path.join(base_path, directory_name)

    utilities.create_directory(full_path)

    return full_path
def make_values_and_parameter_sweep(output_dir, bag_file, map_file, image_topic,
                                    gnc_config):
    output_dir = utilities.create_directory(output_dir)
    print('Output directory for results is {}'.format(output_dir))

    value_ranges, value_names = make_value_ranges()
    save_values(value_names, value_ranges, 'individual_value_ranges.csv', output_dir)

    all_value_combos = make_all_value_combinations(value_ranges)
    save_values(value_names, all_value_combos, 'all_value_combos.csv', output_dir)

    parameter_sweep(all_value_combos, value_names, output_dir, bag_file, map_file,
                    image_topic, gnc_config)
def make_values_and_parameter_sweep(
    output_dir,
    bag_file,
    map_file,
    image_topic,
    config_path,
    robot_config,
    world,
    use_image_features,
    groundtruth_bagfile,
    rmse_rel_start_time=0,
    rmse_rel_end_time=-1,
):
    output_dir = utilities.create_directory(output_dir)
    print(("Output directory for results is {}".format(output_dir)))

    value_ranges, value_names = make_value_ranges()
    parameter_sweep_utilities.save_values(value_names, value_ranges,
                                          "individual_value_ranges.csv", output_dir)

    all_value_combos = parameter_sweep_utilities.make_all_value_combinations(value_ranges)
    parameter_sweep_utilities.save_values(value_names, all_value_combos,
                                          "all_value_combos.csv", output_dir)

    parameter_sweep(
        all_value_combos,
        value_names,
        output_dir,
        bag_file,
        map_file,
        image_topic,
        config_path,
        robot_config,
        world,
        use_image_features,
        groundtruth_bagfile,
        rmse_rel_start_time,
        rmse_rel_end_time,
    )
    combined_results_file = os.path.join(output_dir, "param_sweep_combined_results.csv")
    value_combos_file = os.path.join(output_dir, "all_value_combos.csv")
    results_pdf_file = os.path.join(output_dir, "param_sweep_results.pdf")
    plot_parameter_sweep_results.create_plots(results_pdf_file, combined_results_file,
                                              value_combos_file)
    return output_dir
def distribute_statistics_local(product_id, source_path, destination_path):
    '''
    Description:
        Copies the statistics to the specified directory on the local system

    Parameters:
        product_id - The unique product ID associated with the files.
        source_path - The full path to where the statistics files to distribute reside.
        destination_path - The full path on the local system to copy the statistics
                           files into.

    Note:
        - It is assumed a stats directory exists under the source_path
        - A stats directory will be created under the destination path
    '''

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    d_name = 'stats'

    # Save the current directory location and change to the source directory
    current_directory = os.getcwd()
    os.chdir(source_path)

    try:
        stats_path = os.path.join(destination_path, d_name)
        stats_files = ''.join([d_name, '/', product_id, '*'])

        # Create the statistics directory under the destination path
        logger.info("Creating directory {0}".format(stats_path))
        utilities.create_directory(stats_path)

        # Remove any pre-existing statistics for this product ID
        cmd = ' '.join(['rm', '-f', os.path.join(destination_path, stats_files)])
        output = ''
        try:
            output = utilities.execute_cmd(cmd)
        except Exception as e:
            raise ee.ESPAException(ee.ErrorCodes.distributing_product,
                                   str(e)), None, sys.exc_info()[2]
        finally:
            if len(output) > 0:
                logger.info(output)

        # Transfer the statistics files
        for file_path in glob.glob(stats_files):
            filename = os.path.basename(file_path)
            dest_file_path = os.path.join(stats_path, filename)

            logger.info("Copying {0} to {1}".format(filename, dest_file_path))
            shutil.copyfile(file_path, dest_file_path)

    except Exception as e:
        logger.exception("An exception occurred processing {0}".format(product_id))
        e_code = ee.ErrorCodes.distributing_product
        raise ee.ESPAException(e_code, str(e)), None, sys.exc_info()[2]

    finally:
        # Change back to the previous directory
        os.chdir(current_directory)
def Deposito():
    import pandas as pd
    from utilities import Print_Error, get_files, Select_Menu, create_directory, OpenFile

    available_file = []
    search_locations = []
    save_location = ""

    # Read the search and save locations from the configuration file
    main_config = open("CONFIG/MAIN.config")
    for line in main_config:
        if line.split(';')[0] == "search_location":
            search_locations.append(line.split(';')[1].strip())
        elif line.split(';')[0] == "save_location":
            save_location = line.split(';')[1].strip()

    # Collect the available .XLS files and let the user pick one
    for file in get_files(search_locations):
        if file.upper().endswith('.XLS'):
            available_file.append(file)
    file_name = Select_Menu(available_file, "Select a File", return_type=int)
    file_name = available_file[file_name]

    if file_name.upper().endswith(".XLS"):
        print "Importing XLS File!"
        sheet = "LinnerBooking"
        df = pd.read_excel(io=file_name, sheet_name=sheet)
        df = df[['Booking', 'Deposito', 'Weight', 'Tipo Ctr']]
        df = df.loc[(df['Deposito'] == "MEDLOG SAN ANTONIO") |
                    (df['Deposito'] == "SITRANS SAI ALTO DEPOT") |
                    (df['Deposito'] == "SITRANS VALPARAISO DEPOT") |
                    (df['Deposito'] == "MEDLOG SANTIAGO")]
        df['Weight'] = df['Weight'] / 1000  # Convert to tons.
        # df = df.loc[(df['Tipo Ctr'] == '20DV') | (df['Tipo Ctr'] == '40DV') | (df['Tipo Ctr'] == '40HC')]

        # Count bookings per depot and container type
        table = pd.pivot_table(df, values='Weight', aggfunc='count',
                               index='Deposito', columns='Tipo Ctr')
        table = table.reindex(columns=['20DV', '40DV', '40HC'])
        table = table.rename(index={'MEDLOG SAN ANTONIO': 'SAI',
                                    'SITRANS SAI ALTO DEPOT': 'SAI',
                                    'SITRANS VALPARAISO DEPOT': 'VAP',
                                    'MEDLOG SANTIAGO': 'STGO'})
        table = table.groupby('Deposito').sum()
        # print table.iloc[0]['20DV']

        # Write the pivoted counts into a workbook, one column per value
        import openpyxl
        import os
        wb = openpyxl.Workbook()
        sheet = wb.active
        list = []
        print table
        data = []
        for y in range(len(table.index)):
            data.append([])
            for x in range(len(table.columns)):
                data[-1].append(table.iloc[y][x])
        x = 1
        z = 0
        for deposit in data:
            r = 0
            sheet.cell(1, x, str(table.index[z]))
            for value in deposit:
                sheet.cell(2, x, str(table.columns[r]))
                sheet.cell(3, x, float(value))
                x += 1
                r += 1
            x += 1
            z += 1
        wb.save('demo.xlsx')
        wb.close()

        import subprocess
        if save_location == "":
            print "Saving Output in Program Location!"
        elif not os.path.exists(save_location):
            Print_Error("Save Directory Not Found!")
            create_directory(save_location)
        try:
            table.to_excel(save_location + '/file_output.xlsx')
            print "Saved Successfully"
        except:
            Print_Error('Error Saving File!')
        directory = os.getcwd() + '/demo.xlsx'
        OpenFile(directory)
    else:
        Print_Error("File not compatible!")
def main():
    parser = ArgumentParser()
    parser.add_argument("--output-dir", help="output directory", type=str, required=True)
    parser.add_argument("--month", help="month", type=str, required=True)
    parser.add_argument("--year", help="year", type=str, required=True)
    parser.add_argument(
        "--import-csv",
        help="yes/no : Whether to import existing csv file. Default is 'no'",
        type=str,
        default="no",
    )
    args = parser.parse_args()

    main_dir = args.output_dir
    n_month, n_year = args.month.lower(), args.year
    work_dir = main_dir + "//" + n_month + "_" + n_year
    create_directory(main_dir)
    create_directory(work_dir)

    log_file_write = open(work_dir + "//scrape_en-hi_log_file.txt", mode="w")
    log_file_write.write(f"{n_month,n_year}\n")

    if args.import_csv.lower() == "yes":
        set_import = True
    elif args.import_csv.lower() == "no":
        set_import = False
    else:
        log_file_write.write(f"\n Please enter a valid option for import-csv")

    scrape_loc_en = work_dir + "//" + "scrape_file_en_" + n_month + "_" + n_year
    scrape_loc_hi = work_dir + "//" + "scrape_file_hi_" + n_month + "_" + n_year
    create_directory(scrape_loc_hi)
    create_directory(scrape_loc_en)

    url_file_loc = "file:///" + HTML_FOLDER + "//Press Information Bureau."
    filename_url_en = url_file_loc + "_en_" + n_month + "_" + n_year + ".html"
    filename_url_hi = url_file_loc + "_hi_" + n_month + "_" + n_year + ".html"

    ministy_pa_list = pd.read_csv(MINISTRY_NAME_PARALLEL_LOCATION, encoding="utf-16")

    parse_url_en = get_html(filename_url_en)
    parse_url_hi = get_html(filename_url_hi)
    no_of_result_en = int(
        (parse_url_en.find("div", {"class": "search_box_result"}).contents[0]).split()[1]
    )
    no_of_result_hi = int(
        (parse_url_hi.find("div", {"class": "search_box_result"}).contents[0]).split()[1]
    )
    log_file_write.write(f"\nNo of search result in {n_month} of {n_year}:")
    log_file_write.write(f"\n English: {no_of_result_en} \n Hindi: {no_of_result_hi}")
    log_file_write.write(
        f"\nNo of Ministry in English search result: {len(parse_url_en.findAll('h3', {'class': 'font104'}))}"
    )
    log_file_write.write(
        f"\nNo of Ministry in Hindi search result: {len(parse_url_hi.findAll('h3', {'class': 'font104'}))}"
    )

    # Import or create the English dataframe
    df_en = get_data(
        n_month,
        n_year,
        filename_url_en,
        ministy_pa_list,
        "en",
        log_file_write,
        import_data=set_import,
        import_data_dir=work_dir,
    )
    if "PRID" not in df_en.columns.tolist():
        df_en["PRID"] = df_en["Link"].apply(lambda x: x.split("=")[-1])
    log_file_write.write(f"\n English Dataframe \n")
    log_file_write.write(f"\n Dataframe Info:\n")
    df_en.info(buf=log_file_write)

    # Write the English dataframe
    df_en.to_csv(
        os.path.join(work_dir, "English_data_" + n_month + "_" + n_year + ".csv"),
        index=False,
        encoding="utf-16",
    )

    # Scraping English documents
    iter_f = df_en.shape[0]
    log_file_write.write("\nStarting scraping for English Document")
    for i in range(iter_f):
        en_scrape_file = (
            scrape_loc_en
            + "//"
            + str(i).zfill(4)
            + "_en_"
            + "_".join(df_en.loc[i, ["English_Ministry_Name"]].values[0].split())
            + "_"
            + df_en.loc[i, ["Posting_Date"]].values[0].strftime("%Y-%m-%d")
            + "_"
            + str(df_en.loc[i, ["PRID"]].values[0])
            + ".txt"
        )
        m = 0
        while m == 0:
            try:
                b = get_html(df_en.Link[i], "lxml")
                m = b.body.form.find(
                    "div", {"class": "innner-page-main-about-us-content-right-part"}
                )
            except:
                log_file_write.write("\nerror:retrying")
                m = 0
        if m is None:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_en.Link[i]}, no english content found"
            )
            continue
        k_en = [
            str(k.get_text()).strip()
            for k in m.findAll(
                ["div", "tr", "td", "p", "ol", "h2", "h3", "h4", "ul", "pre",
                 "span", "li"]
            )
            if len(
                k.find_parents(["p", "ol", "h2", "h3", "h4", "ul", "pre", "span", "li"])
            ) == 0
        ]
        if len(k_en) == 0:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_en.Link[i]}, no English content in various tags"
            )
            continue
        log_file_write.write(f"\nindex: {i}, number of lines: {len(k_en)}")
        write_scrape_text_file(en_scrape_file, k_en, df_en.English_Ministry_Name[i])
    log_file_write.write(f"\nDone scraping for English Document")

    # Import or create the Hindi dataframe
    df_hi = get_data(
        n_month,
        n_year,
        filename_url_hi,
        ministy_pa_list,
        "hi",
        log_file_write,
        import_data=set_import,
        import_data_dir=work_dir,
    )
    if "PRID" not in df_hi.columns.tolist():
        df_hi["PRID"] = df_hi["Link"].apply(lambda x: x.split("=")[-1])
    log_file_write.write(f"\nHindi Dataframe \n")
    log_file_write.write(f"\nDataframe Info:\n")
    df_hi.info(buf=log_file_write)

    # Write the Hindi dataframe
    df_hi.to_csv(
        os.path.join(work_dir, "Hindi_data_" + n_month + "_" + n_year + ".csv"),
        index=False,
        encoding="utf-16",
    )

    # Scraping Hindi documents
    iter_f = df_hi.shape[0]
    log_file_write.write("\nStarting scraping for Hindi Document")
    for i in range(iter_f):
        hi_scrape_file = (
            scrape_loc_hi
            + "//"
            + str(i).zfill(4)
            + "_hi_"
            + "_".join(df_hi.loc[i, ["English_Ministry_Name"]].values[0].split())
            + "_"
            + df_hi.loc[i, ["Posting_Date"]].values[0].strftime("%Y-%m-%d")
            + "_"
            + str(df_hi.loc[i, ["PRID"]].values[0])
            + ".txt"
        )
        m = 0
        while m == 0:
            try:
                b = get_html(df_hi.Link[i], "lxml")
                m = b.body.form.find(
                    "div", {"class": "innner-page-main-about-us-content-right-part"}
                )
            except:
                log_file_write.write("\nerror:retrying")
                m = 0
        if m is None:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_hi.Link[i]}, no hindi content found"
            )
            continue
        k_hi = [
            str(k.get_text()).strip()
            for k in m.findAll(
                ["div", "tr", "td", "p", "ol", "h2", "h3", "h4", "ul", "pre",
                 "span", "li"]
            )
            if len(
                k.find_parents(["p", "ol", "h2", "h3", "h4", "ul", "pre", "span", "li"])
            ) == 0
        ]
        if len(k_hi) == 0:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_hi.Link[i]}, no hindi content in various tags"
            )
            continue
        log_file_write.write(f"\nindex: {i}, number of lines: {len(k_hi)}")
        write_scrape_text_file(hi_scrape_file, k_hi, df_hi.Hindi_Ministry_Name[i])
    log_file_write.write("\nDone scraping for Hindi Document")

    log_file_write.close()
def distribute_statistics_local(immutability, product_id, source_path,
                                destination_path):
    '''
    Description:
        Copies the statistics to the specified directory on the local system

    Parameters:
        immutability - Whether to toggle the immutable attribute on the
                       destination files.
        product_id - The unique product ID associated with the files.
        source_path - The full path to where the statistics files to distribute reside.
        destination_path - The full path on the local system to copy the statistics
                           files into.

    Note:
        - It is assumed a stats directory exists under the source_path
        - A stats directory will be created under the destination path
    '''

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    d_name = 'stats'

    # Save the current directory location and change to the source directory
    current_directory = os.getcwd()
    os.chdir(source_path)

    try:
        stats_wildcard = ''.join([product_id, '*'])
        stats_path = os.path.join(destination_path, d_name)
        stats_files = os.path.join(d_name, stats_wildcard)
        dest_stats_wildcard = os.path.join(stats_path, stats_wildcard)

        # Create the statistics directory under the destination path
        logger.info("Creating directory {0}".format(stats_path))
        utilities.create_directory(stats_path)

        # Change the attributes on the files so that we can remove them
        if immutability:
            cmd = ' '.join(['sudo', 'chattr', '-if', dest_stats_wildcard])
            output = ''
            try:
                output = utilities.execute_cmd(cmd)
            except Exception:
                pass
            finally:
                if len(output) > 0:
                    logger.info(output)

        # Remove any pre-existing statistics for this product ID
        cmd = ' '.join(['rm', '-f', dest_stats_wildcard])
        output = ''
        try:
            output = utilities.execute_cmd(cmd)
        finally:
            if len(output) > 0:
                logger.info(output)

        # Transfer the statistics files
        for file_path in glob.glob(stats_files):
            filename = os.path.basename(file_path)
            dest_file_path = os.path.join(stats_path, filename)

            logger.info("Copying {0} to {1}".format(filename, dest_file_path))
            shutil.copyfile(file_path, dest_file_path)

        # Change the attributes on the files so that we can't remove them
        if immutability:
            cmd = ' '.join(['sudo', 'chattr', '+i', dest_stats_wildcard])
            output = ''
            try:
                output = utilities.execute_cmd(cmd)
            finally:
                if len(output) > 0:
                    logger.info(output)

    except Exception:
        logger.exception('An exception occurred processing {0}'.format(product_id))
        raise

    finally:
        # Change back to the previous directory
        os.chdir(current_directory)
def package_product(immutability, source_directory, destination_directory,
                    product_name):
    '''
    Description:
        Package the contents of the source directory into a gzipped tarball
        located in the destination directory and generate a checksum file
        for it.

        The filename will be prefixed with the specified product name.

    Returns:
        product_full_path - The full path to the product including filename
        cksum_full_path - The full path to the check sum including filename
        cksum_value - The checksum value
    '''

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    product_full_path = os.path.join(destination_directory, product_name)

    # Make sure the directory exists.
    utilities.create_directory(destination_directory)

    # Remove any pre-existing files
    # Grab the first part of the filename, which is not unique
    filename_parts = product_full_path.split('-')
    filename_parts[-1] = '*'  # Replace the last element of the list
    filename = '-'.join(filename_parts)  # Join with '-'

    # Name of the checksum to be created
    cksum_filename = '.'.join([product_name, settings.ESPA_CHECKSUM_EXTENSION])

    # Change the attributes on the files so that we can remove them
    if immutability:
        cmd = ' '.join(['sudo', 'chattr', '-if', filename, cksum_filename])
        output = ''
        try:
            output = utilities.execute_cmd(cmd)
        except Exception:
            pass
        finally:
            if len(output) > 0:
                logger.info(output)

    # Remove the file first just in-case this is a second run
    cmd = ' '.join(['rm', '-f', filename])
    output = ''
    try:
        output = utilities.execute_cmd(cmd)
    finally:
        if len(output) > 0:
            logger.info(output)

    # Change to the source directory
    current_directory = os.getcwd()
    os.chdir(source_directory)

    try:
        # Tar the files
        logger.info("Packaging completed product to %s.tar.gz"
                    % product_full_path)

        # Grab the files to tar and gzip
        product_files = glob.glob("*")

        # Execute tar with zipping, the full/path/*.tar.gz name is returned
        product_full_path = utilities.tar_files(product_full_path,
                                                product_files, gzip=True)

        # Change file permissions
        logger.info("Changing file permissions on %s to 0644"
                    % product_full_path)
        os.chmod(product_full_path, 0644)

        # Verify that the archive is good
        output = ''
        cmd = ' '.join(['tar', '-tf', product_full_path])
        try:
            output = utilities.execute_cmd(cmd)
        finally:
            if len(output) > 0:
                logger.info(output)

        # If it was good create a checksum file
        cksum_output = ''
        cmd = ' '.join([settings.ESPA_CHECKSUM_TOOL, product_full_path])
        try:
            cksum_output = utilities.execute_cmd(cmd)
        finally:
            if len(cksum_output) > 0:
                logger.info(cksum_output)

        # Get the base filename of the file that was checksum'd
        cksum_prod_filename = os.path.basename(product_full_path)

        logger.debug("Checksum file = %s" % cksum_filename)
        logger.debug("Checksum'd file = %s" % cksum_prod_filename)

        # Make sure they are strings
        cksum_values = cksum_output.split()
        cksum_value = "%s %s" % (str(cksum_values[0]), str(cksum_prod_filename))
        logger.info("Generating cksum: %s" % cksum_value)

        cksum_full_path = os.path.join(destination_directory, cksum_filename)

        try:
            with open(cksum_full_path, 'wb+') as cksum_fd:
                cksum_fd.write(cksum_value)
        except Exception:
            logger.exception('Error building checksum file')
            raise

    finally:
        # Change back to the previous directory
        os.chdir(current_directory)

    return (product_full_path, cksum_full_path, cksum_value)
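# A self-contained sketch of the same package-and-checksum idea, using only the standard
# library (tarfile + hashlib) in place of the external tar and checksum tools invoked
# above. The file naming and MD5 choice are assumptions for illustration; this is not
# the project's implementation.
import hashlib
import os
import tarfile


def package_directory(source_directory, destination_directory, product_name):
    """Tar/gzip source_directory and write a '<md5sum>  <tarball>' checksum file."""
    os.makedirs(destination_directory, exist_ok=True)
    tarball_path = os.path.join(destination_directory, product_name + '.tar.gz')
    with tarfile.open(tarball_path, 'w:gz') as tar:
        tar.add(source_directory, arcname=product_name)

    digest = hashlib.md5()
    with open(tarball_path, 'rb') as tar_fd:
        for block in iter(lambda: tar_fd.read(1 << 20), b''):
            digest.update(block)

    cksum_value = '{}  {}'.format(digest.hexdigest(), os.path.basename(tarball_path))
    cksum_path = os.path.join(destination_directory, product_name + '.md5')
    with open(cksum_path, 'w') as cksum_fd:
        cksum_fd.write(cksum_value + '\n')
    return tarball_path, cksum_path, cksum_value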
def main():
    parser = ArgumentParser()
    parser.add_argument("--output-dir", help="output directory", type=str, required=True)
    parser.add_argument("--month", help="month", type=str, required=True)
    parser.add_argument("--year", help="year", type=str, required=True)
    args = parser.parse_args()

    n_month, n_year = str(args.month).lower(), str(args.year)
    work_dir = args.output_dir + "//" + n_month + "_" + n_year
    align_loc = work_dir + "//" + "align_" + n_month + "_" + n_year
    tokenize_loc_en = work_dir + "//" + "tokenize_en_" + n_month + "_" + n_year
    tokenize_loc_hi = work_dir + "//" + "tokenize_hi_" + n_month + "_" + n_year
    submit_aligner = work_dir + "//" + "submit_aligner_" + n_month + "_" + n_year
    en_data_file = "_".join(["English", "data", n_month, n_year]) + ".csv"
    hi_data_file = "_".join(["Hindi", "data", n_month, n_year]) + ".csv"
    create_directory(align_loc)
    create_directory(submit_aligner)

    df_en = pd.read_csv(work_dir + "//" + en_data_file, encoding="utf-16")
    df_hi = pd.read_csv(work_dir + "//" + hi_data_file, encoding="utf-16")
    df_en = preprocess_dataframe(df_en)
    df_hi = preprocess_dataframe(df_hi)
    df_en.to_csv(work_dir + "//" + en_data_file, index=False, encoding="utf-16")
    df_hi.to_csv(work_dir + "//" + hi_data_file, index=False, encoding="utf-16")

    # Create files which are parallel based on Ministry Name and Posting Date
    k_hi = pd.DataFrame(
        df_hi[["English_Ministry_Name", "Posting_Date", "index"]]
        .groupby(["English_Ministry_Name", "Posting_Date"])["index"]
        .apply(lambda x: x.tolist())
    )
    k_en = pd.DataFrame(
        df_en[["English_Ministry_Name", "Posting_Date", "index"]]
        .groupby(["English_Ministry_Name", "Posting_Date"])["index"]
        .apply(lambda x: x.tolist())
    )
    k_merge = pd.merge(
        k_en,
        k_hi,
        left_index=True,
        right_index=True,
        how="inner",
        suffixes=("_en", "_hi"),
    )
    k_merge.to_csv(
        work_dir + "//" + "submit_aligner_" + n_month + "_" + n_year + ".csv",
        index=True,
        encoding="utf-16",
    )

    fl_tok_en = sorted(glob.glob(tokenize_loc_en + "//" + "*.txt"))
    fl_tok_hi = sorted(glob.glob(tokenize_loc_hi + "//" + "*.txt"))

    no_sen_df = pd.DataFrame(
        columns=[
            "Filename_en",
            "Total_sentences_en",
            "Filename_hi",
            "Total_sentences_hi",
        ]
    )
    for count, i in enumerate(k_merge.iterrows()):
        en_align_file = (
            submit_aligner
            + "//subalign_"
            + str(count).zfill(4)
            + "_en_"
            + "_".join(i[0][0].split())
            + "_"
            + i[0][1].strftime("%Y-%m-%d")
            + ".txt"
        )
        hi_align_file = (
            submit_aligner
            + "//subalign_"
            + str(count).zfill(4)
            + "_hi_"
            + "_".join(i[0][0].split())
            + "_"
            + i[0][1].strftime("%Y-%m-%d")
            + ".txt"
        )
        with open(en_align_file, encoding="utf-16", mode="w") as flw_en:
            count_en = 0
            for ind in i[1]["index_en"]:
                with open(fl_tok_en[ind], encoding="utf-16", mode="r") as flr_en:
                    k_en = flr_en.read()
                    count_en += k_en.count("\n")
                    flw_en.write(k_en)
        with open(hi_align_file, encoding="utf-16", mode="w") as flw_hi:
            count_hi = 0
            for ind in i[1]["index_hi"]:
                with open(fl_tok_hi[ind], encoding="utf-16", mode="r") as flr_hi:
                    k_hi = flr_hi.read()
                    count_hi += k_hi.count("\n")
                    flw_hi.write(k_hi)
        no_sen_df = no_sen_df.append(
            {
                "Filename_en": os.path.basename(en_align_file),
                "Total_sentences_en": count_en,
                "Filename_hi": os.path.basename(hi_align_file),
                "Total_sentences_hi": count_hi,
            },
            ignore_index=True,
        )
        print(
            f"Writing {os.path.basename(en_align_file)} and {os.path.basename(hi_align_file)} done"
        )

    no_sen_df.to_csv(
        work_dir + "//" + "tok_sen_count_" + n_month + "_" + n_year + ".csv",
        index=False,
        encoding="utf-16",
    )

    fl_list = glob.glob(submit_aligner + "//" + "*.txt")
    en_fl = sorted([i for i in fl_list if os.path.basename(i).split("_")[2] == "en"])
    hi_fl = sorted([i for i in fl_list if os.path.basename(i).split("_")[2] == "hi"])
    c_fl = list(zip(en_fl, hi_fl))
    for i in c_fl:
        extract_bitext(BEARER_TOKEN, align_loc, i[0], i[1])
def scrape_pib_archives(df_data, month, year, lang, out_dir, list_ministry):
    """
    Scrape text using the links provided in the dataframe
    Create and write a new dataframe with the posting date-time
    """
    print(
        f'Scraping for {month}, {year}, {"English" if lang=="en" else "Hindi"}'
    )
    n_month, n_year = str(month), str(year)
    n_month = n_month.lower()
    main_dir = out_dir
    work_dir = main_dir + "//" + n_month + "_" + n_year
    create_directory(work_dir)
    if lang == "en":
        language = "English"
        language_2 = "Hindi"
        scrape_loc = work_dir + "//" + "scrape_file_en_" + n_month + "_" + n_year
    elif lang == "hi":
        language = "Hindi"
        language_2 = "English"
        scrape_loc = work_dir + "//" + "scrape_file_hi_" + n_month + "_" + n_year
    else:
        print("Pass valid language code")
        return None
    create_directory(scrape_loc)

    df_data[language + "_Ministry_Name"] = [""] * df_data.shape[0]
    df_data[language_2 + "_Ministry_Name"] = [""] * df_data.shape[0]
    df_data["Posting_Datetime"] = [pd.to_datetime(np.nan)] * df_data.shape[0]
    df_data["Posting_Date"] = df_data["Posting_Datetime"].apply(lambda x: x.date())

    for p_th in range(df_data.shape[0])[:]:
        b_source = get_html(df_data.loc[p_th, "Link"])
        m_dt = b_source.find("div", attrs={"class": "mddiv content-ministry"})
        m = b_source.find("div", attrs={"class": "contentdiv"})
        df_data.at[p_th, language + "_Ministry_Name"] = str(" ".join(
            m_dt.contents[0].strip().split()))
        if (str(" ".join(m_dt.contents[0].strip().split()))
                not in list_ministry[language + "_Ministry_Name"].values.tolist()):
            print(
                "Ministry name missing:",
                str(" ".join(m_dt.contents[0].strip().split())),
            )
        else:
            df_data.at[p_th, language_2 + "_Ministry_Name"] = list_ministry[
                list_ministry[language + "_Ministry_Name"] ==
                df_data.at[p_th, language + "_Ministry_Name"]][
                    language_2 + "_Ministry_Name"].values[0]
        df_data.at[p_th, "Posting_Datetime"] = pd.to_datetime(
            (" ".join(m_dt.contents[1].text.split()[:-1])).replace(
                ".", ":").replace(": ", ":"))
        df_data.at[p_th, "Posting_Date"] = df_data.at[p_th,
                                                      "Posting_Datetime"].date()
        scrape_file = (
            scrape_loc + "//" + str(p_th).zfill(4) + "_" + lang + "_" +
            "_".join(df_data.loc[p_th, ["English_Ministry_Name"]].values[0].split()) +
            "_" +
            df_data.loc[p_th, ["Posting_Date"]].values[0].strftime("%Y-%m-%d") +
            "_" + df_data.loc[p_th, "Link"].split("=")[-1] + ".txt")
        k_en = [
            str(k.get_text()).strip() for k in m.findAll([
                "div", "tr", "td", "p", "ol", "h2", "h3", "h4", "ul", "pre",
                "span", "li",
            ]) if len(
                k.find_parents([
                    "p", "ol", "h2", "h3", "h4", "ul", "pre", "span", "li"
                ])) == 0
        ]
        with open(scrape_file, mode="w", encoding="utf-16") as file_w:
            for line in k_en:
                if "@font-face" in line.strip():
                    continue
                line = re.sub("\r\n-", "\n-", line)
                line = re.sub("\.\s+\r\n", ".\n", line)
                # print(line)
                line = re.sub(":\s+\r\n", ":\n", line)
                line = re.sub(";\s+\r\n", ";\n", line)
                line = line.replace("\r\n", " ")
                for ln in line.split("\n"):
                    ln = ln.strip()
                    if len(ln.strip()) == 0:
                        continue
                    if "@font-face" in ln.strip():
                        continue
                    ln = " ".join(ln.split())
                    file_w.write(ln.strip().replace("\r", "") + "\n")
    print(df_data.info())
    if True:
        df_data.to_csv(
            os.path.join(work_dir,
                         language + "_data_" + n_month + "_" + n_year + ".csv"),
            index=False,
            encoding="utf-16",
        )
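# A tiny, self-contained illustration of the core of the line clean-up applied above:
# drop style residue, fold Windows line endings into sentence breaks after ., :, and ;,
# then collapse runs of whitespace. The helper name and sample text are made up.
import re


def clean_scraped_line(line):
    """Return the cleaned sub-lines extracted from one scraped text block."""
    if "@font-face" in line:
        return []
    line = re.sub(r"\.\s+\r\n", ".\n", line)
    line = re.sub(r":\s+\r\n", ":\n", line)
    line = re.sub(r";\s+\r\n", ";\n", line)
    line = line.replace("\r\n", " ")
    return [" ".join(part.split()) for part in line.split("\n") if part.strip()]


print(clean_scraped_line("Ministry of Textiles.  \r\nNew   Delhi\r\nRelease"))
# -> ['Ministry of Textiles.', 'New Delhi Release']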
        param_range_directory_for_bag = parameter_sweep.make_values_and_parameter_sweep(
            bag_output_dir, graph_bag_params.bagfile, graph_bag_params.map_file,
            graph_bag_params.image_topic, graph_bag_params.config_path,
            graph_bag_params.robot_config_file, graph_bag_params.world,
            graph_bag_params.use_image_features)
        if not param_range_directory:
            param_range_directory = param_range_directory_for_bag
        combined_results_csv_files.append(
            os.path.join(bag_output_dir, 'param_sweep_combined_results.csv'))

    average_parameter_sweep_results(combined_results_csv_files, output_dir)
    save_ranges(param_range_directory, output_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('config_file')
    parser.add_argument('output_dir')
    args = parser.parse_args()

    if not os.path.isfile(args.config_file):
        print('Config file ' + args.config_file + ' does not exist.')
        sys.exit()
    if os.path.isdir(args.output_dir):
        print('Output directory ' + args.output_dir + ' already exists.')
        sys.exit()

    output_dir = utilities.create_directory(args.output_dir)

    graph_bag_params_list = bag_sweep.load_params(args.config_file)
    bag_and_parameter_sweep(graph_bag_params_list, output_dir)

    combined_results_file = os.path.join(output_dir, 'bag_and_param_sweep_stats.csv')
    value_combos_file = os.path.join(output_dir, 'all_value_combos.csv')
    results_pdf_file = os.path.join(output_dir, 'bag_and_param_sweep_results.pdf')
    plot_parameter_sweep_results.create_plots(results_pdf_file, combined_results_file,
                                              value_combos_file)
n_year + ".csv"), index=False, encoding="utf-16", ) if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--output-dir", help="output directory", type=str, required=True) parser.add_argument("--month", help="month", type=str, required=True) parser.add_argument("--year", help="year", type=str, required=True) args = parser.parse_args() create_directory(args.output_dir) # creating release id and url link datframe # also creating ministry list df_en, list_ministry_en = get_prid_and_ministry_list( args.month, args.year, "en") df_hi, list_ministry_hi = get_prid_and_ministry_list( args.month, args.year, "hi") if len(list_ministry_en) == len(list_ministry_en): print(len(list_ministry_en), len(list_ministry_en)) ministry_data = pd.DataFrame( list(zip(list_ministry_en, list_ministry_hi)), columns=["English_Ministry_Name", "Hindi_Ministry_Name"], ) # Scraping the url links scrape_pib_archives(df_en, args.month, args.year, "en", args.output_dir, ministry_data)