Example #1
def archive_log_files(args, proc_cfg, proc_status):
    """Archive the log files for the current execution

    Args:
        args <args>: Command line arguments
        proc_cfg <ConfigParser>: Configuration
        proc_status <bool>: True = Success, False = Error
    """

    base_log = cli_log_filename(args)
    proc_log = EspaLogging.get_filename(settings.PROCESSING_LOGGER)
    dist_path = proc_cfg.get('processing', 'espa_log_archive')
    destination_path = os.path.join(dist_path, args.order_id)

    # Create the archive path
    util.create_directory(destination_path)

    # Copy them
    copy_log_file(base_log, destination_path, proc_status)
    copy_log_file(proc_log, destination_path, proc_status)

    # Remove the source versions
    if os.path.exists(base_log):
        os.unlink(base_log)

    if os.path.exists(proc_log):
        os.unlink(proc_log)
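
The snippet above leans on two helpers that are not shown: util.create_directory and copy_log_file. A minimal sketch of what they might look like, assuming create_directory simply wraps os.makedirs and copy_log_file copies a log only if it exists (names and behavior are assumptions, not the project's actual implementation):

import os
import shutil

def create_directory(path):
    # Create the directory (and any missing parents); do nothing if it exists
    os.makedirs(path, exist_ok=True)

def copy_log_file(log_file, destination_path, proc_status):
    # Hypothetical helper: copy the log into the archive directory when it
    # exists; proc_status could be used to tag failed runs, here it is unused
    if os.path.exists(log_file):
        shutil.copy(log_file, destination_path)
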
Example #2
def archive_log_files(order_id, product_id):
    """Archive the log files for the current job
    """

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    try:
        # Determine the destination path for the logs
        output_dir = Environment().get_distribution_directory()
        destination_path = os.path.join(output_dir, 'logs', order_id)
        # Create the path
        utilities.create_directory(destination_path)

        # Job log file
        logfile_path = EspaLogging.get_filename(settings.PROCESSING_LOGGER)
        full_logfile_path = os.path.abspath(logfile_path)
        log_name = os.path.basename(full_logfile_path)
        # Determine full destination
        destination_file = os.path.join(destination_path, log_name)
        # Copy it
        shutil.copyfile(full_logfile_path, destination_file)

        # Mapper log file
        full_logfile_path = os.path.abspath(MAPPER_LOG_FILENAME)
        final_log_name = '-'.join([MAPPER_LOG_PREFIX, order_id, product_id])
        final_log_name = '.'.join([final_log_name, 'log'])
        # Determine full destination
        destination_file = os.path.join(destination_path, final_log_name)
        # Copy it
        shutil.copyfile(full_logfile_path, destination_file)

    except Exception:
        # We don't care because we are at the end of processing
        # And if we are on the successful path, we don't care either
        logger.exception("Exception encountered and follows")
Example #3
def archive_log_files(order_id, product_id):
    """Archive the log files for the current job
    """

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    try:
        # Determine the destination path for the logs
        output_dir = Environment().get_distribution_directory()
        destination_path = os.path.join(output_dir, 'logs', order_id)
        # Create the path
        utilities.create_directory(destination_path)

        # Job log file
        logfile_path = EspaLogging.get_filename(settings.PROCESSING_LOGGER)
        full_logfile_path = os.path.abspath(logfile_path)
        log_name = os.path.basename(full_logfile_path)
        # Determine full destination
        destination_file = os.path.join(destination_path, log_name)
        # Copy it
        shutil.copyfile(full_logfile_path, destination_file)

        # Mapper log file
        full_logfile_path = os.path.abspath(MAPPER_LOG_FILENAME)
        final_log_name = '-'.join([MAPPER_LOG_PREFIX, order_id, product_id])
        final_log_name = '.'.join([final_log_name, 'log'])
        # Determine full destination
        destination_file = os.path.join(destination_path, final_log_name)
        # Copy it
        shutil.copyfile(full_logfile_path, destination_file)

    except Exception:
        # We don't care because we are at the end of processing
        # And if we are on the successful path, we don't care either
        logger.exception('Exception encountered and follows')
Example #4
def archive_log_files(args, proc_cfg, proc_status):
    """Archive the log files for the current execution

    Args:
        args <args>: Command line arguments
        proc_cfg <ConfigParser>: Configuration
        proc_status <bool>: True = Success, False = Error
    """

    base_log = cli_log_filename(args)
    proc_log = EspaLogging.get_filename(settings.PROCESSING_LOGGER)
    dist_path = proc_cfg.get('processing', 'espa_log_archive')
    destination_path = os.path.join(dist_path, args.order_id)

    # Create the archive path
    util.create_directory(destination_path)

    # Copy them
    copy_log_file(base_log, destination_path, proc_status)
    copy_log_file(proc_log, destination_path, proc_status)

    # Remove the source versions
    if os.path.exists(base_log):
        os.unlink(base_log)

    if os.path.exists(proc_log):
        os.unlink(proc_log)
Example #5
def initialize_processing_directory(base_work_dir, bucket_name, directories=['output', 'stage', 'work']):
    """ Initializes the processing directory and subfolders

    Args:
        base_work_dir (str): relative or absolute path to base working directory
        bucket_name (str): additional subdirectory to work in
        directories (list): all subfolders to create under the base dir

    Returns:
        dict: created directories, keyed by basename
    """
    new_directories = dict()
    if os.path.exists(base_work_dir):
        logging.warning('Removing processing directory: %s', base_work_dir)
        shutil.rmtree(base_work_dir, ignore_errors=True)

    new_directories['base'] = work_dir = os.path.join(base_work_dir, bucket_name)
    logging.info('Create processing directory: %s', work_dir)
    utilities.create_directory(work_dir)

    for folder in dirs_to_make(work_dir, directories):
        logging.debug('Create directory: %s', folder)
        utilities.create_directory(folder)
        new_directories.update({os.path.basename(folder): folder})
    return new_directories
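
initialize_processing_directory iterates over a dirs_to_make helper that is not included above. A plausible sketch, assuming it just joins each requested subfolder onto the working directory (an assumption, not the original implementation):

import os

def dirs_to_make(work_dir, directories):
    # Hypothetical helper: full path for each requested subfolder
    return [os.path.join(work_dir, name) for name in directories]
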
Example #6
def make_values_and_parameter_sweep(output_dir, bag_file, map_file,
                                    image_topic, config_path, robot_config,
                                    world, use_image_features):
    output_dir = utilities.create_directory(output_dir)
    print('Output directory for results is {}'.format(output_dir))

    value_ranges, value_names = make_value_ranges()
    save_values(value_names, value_ranges, 'individual_value_ranges.csv',
                output_dir)

    all_value_combos = make_all_value_combinations(value_ranges)
    save_values(value_names, all_value_combos, 'all_value_combos.csv',
                output_dir)

    parameter_sweep(all_value_combos, value_names, output_dir, bag_file,
                    map_file, image_topic, config_path, robot_config, world,
                    use_image_features)
    combined_results_file = os.path.join(output_dir,
                                         'param_sweep_combined_results.csv')
    value_combos_file = os.path.join(output_dir, 'all_value_combos.csv')
    results_pdf_file = os.path.join(output_dir, 'param_sweep_results.pdf')
    plot_parameter_sweep_results.create_plot(results_pdf_file,
                                             combined_results_file,
                                             value_combos_file)
    return output_dir
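
Several of the sweep examples reassign the result of utilities.create_directory, so that variant is expected to return the directory it created. A minimal sketch under that assumption (the real utility may also make the name unique):

import os

def create_directory(directory):
    # Hypothetical variant: create the directory if needed and return its
    # absolute path so callers can chain it, as the sweep examples do
    directory = os.path.abspath(directory)
    os.makedirs(directory, exist_ok=True)
    return directory
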
Example #7
def generate_normalizers(datafolds,
                         fsample,
                         dataset_dir,
                         cont_cols,
                         normalize_options=('standardize', 'rescale')):
    for fold_name in datafolds:
        print("fold ", fold_name)
        train_idx = datafolds[fold_name][0]
        train_sample = fsample.loc[fsample['nrd_visitlink'].isin(
            train_idx)].copy()
        dsets = (train_sample, )
        for norm_option in normalize_options:
            print("norm_option: ", norm_option)
            dirname = "{}_{}".format(fold_name, norm_option)
            cdir = create_directory(dirname, dataset_dir)
            if (norm_option == 'standardize'):
                normalizer = GaussianNormalizerInfo
            elif (norm_option == 'meanrange'):
                normalizer = MeanRangeNormalizerInfo
            elif (norm_option == 'rescale'):
                normalizer = RescaleNormalizerInfo
            for dset in dsets:
                a, b = get_feature_normalizer(dset, cont_cols, norm_option)
                ReaderWriter.dump_data(
                    normalizer(a, b),
                    os.path.join(cdir, ("{}_info.pkl".format(norm_option))))
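
generate_normalizers depends on get_feature_normalizer returning a pair of per-column statistics for each normalization option. A hedged sketch, assuming the dataset is a pandas DataFrame and that 'standardize' needs (mean, std), 'rescale' needs (min, max), and 'meanrange' needs (mean, range); the real helper may differ:

def get_feature_normalizer(dset, cont_cols, norm_option):
    # Hypothetical helper: two per-column statistics for the chosen option
    cols = dset[cont_cols]
    if norm_option == 'standardize':
        return cols.mean(), cols.std()
    if norm_option == 'rescale':
        return cols.min(), cols.max()
    if norm_option == 'meanrange':
        return cols.mean(), cols.max() - cols.min()
    raise ValueError('unknown normalization option: {}'.format(norm_option))
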
Example #8
def create_local_output_directory(base_path):
    '''
    Description:
        Creates a local output directory.

    Note: "local" in this case means a standard directory.

    Returns:
        string: The fullpath to the "output" directory.

    Parameters:
        base_path - The location where to create the "output" directory under.
    '''

    full_path = os.path.join(base_path, 'output')

    utilities.create_directory(full_path)

    return full_path
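
A short usage sketch for the helper above (the base path is invented, and it assumes the utilities module backing create_directory is importable):

# Creates <base_path>/output if needed and returns its full path
output_path = create_local_output_directory('/tmp/espa-job')
print(output_path)  # expected: /tmp/espa-job/output
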
Example #9
def tokenized_scrape_file(n_month, n_year, lang, out_dir):
    """
    Read the scrape file and tokenize the sentences
    Write the tokenized files
    """
    if lang == "en":
        language = "English"
    elif lang == "hi":
        language = "Hindi"
    else:
        print("Invalid language code passed")
        return None
    work_dir = out_dir + "//" + n_month + "_" + n_year
    scrape_loc = work_dir + "//" + "_".join(
        ["scrape_file", lang, n_month, n_year])
    tokenize_loc = work_dir + "//" + "_".join(
        ["tokenize", lang, n_month, n_year])
    create_directory(tokenize_loc)
    fl_list = sorted(glob.glob(os.path.join(scrape_loc, "*.txt")))
    for k, fl in enumerate(fl_list):
        print(os.path.basename(fl))
        flname = tokenize_loc + "//tok_" + os.path.basename(fl)
        with open(fl, mode="r", encoding="utf-16") as file_n:
            para_val = [{
                "text": line.strip()
            } for line in file_n if len(line.strip().split()) > 2]
        if len(para_val) > 500:
            sen = []
            for i in range(int(np.ceil(len(para_val) / 500)) + 1):
                js = {"paragraphs": para_val[i * 500:(i + 1) * 500]}
                sen_sub = api_sen_tokenizer_call(js, lang)
                for line in sen_sub:
                    sen.append(line)
        else:
            js = {"paragraphs": para_val}
            sen = api_sen_tokenizer_call(js, lang)
        dump_1 = (pd.DataFrame(
            sen, columns=["sen"]).drop_duplicates().loc[:,
                                                        "sen"].values.tolist())
        sen = dump_1
        write_sentence_list_to_file(flname, sen)
    return None
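
tokenized_scrape_file finishes by calling write_sentence_list_to_file, which is not shown. A minimal sketch assuming it writes one tokenized sentence per line in the same UTF-16 encoding used for the input files:

def write_sentence_list_to_file(filename, sentences):
    # Hypothetical helper: one tokenized sentence per line, UTF-16 encoded
    with open(filename, mode="w", encoding="utf-16") as file_w:
        for sen in sentences:
            file_w.write(str(sen).strip() + "\n")
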
Example #10
def __create_local_directory(base_path, directory_name):
    '''
    Description:
        Creates a local directory under the base path.

    Note: "local" in this case means a standard directory.

    Returns:
        string: The fullpath to the directory created.

    Parameters:
        string: base_path - The location where to create the directory.
        string: directory_name - The name of the directory to be created.
    '''

    full_path = os.path.join(base_path, directory_name)

    utilities.create_directory(full_path)

    return full_path
Example #11
def make_values_and_parameter_sweep(output_dir, bag_file, map_file, image_topic, gnc_config):
  output_dir = utilities.create_directory(output_dir)
  print('Output directory for results is {}'.format(output_dir))

  value_ranges, value_names = make_value_ranges()
  save_values(value_names, value_ranges, 'individual_value_ranges.csv', output_dir)

  all_value_combos = make_all_value_combinations(value_ranges)
  save_values(value_names, all_value_combos, 'all_value_combos.csv', output_dir)

  parameter_sweep(all_value_combos, value_names, output_dir, bag_file, map_file, image_topic, gnc_config)
Example #12
def __create_local_directory(base_path, directory_name):
    '''
    Description:
        Creates a local directory under the base path.

    Note: "local" in this case means a standard directory.

    Returns:
        string: The fullpath to the directory created.

    Parameters:
        string: base_path - The location where to create the directory.
        string: directory_name - The name of the directory to be created.
    '''

    full_path = os.path.join(base_path, directory_name)

    utilities.create_directory(full_path)

    return full_path
Example #13
def make_values_and_parameter_sweep(
    output_dir,
    bag_file,
    map_file,
    image_topic,
    config_path,
    robot_config,
    world,
    use_image_features,
    groundtruth_bagfile,
    rmse_rel_start_time=0,
    rmse_rel_end_time=-1,
):
    output_dir = utilities.create_directory(output_dir)
    print(("Output directory for results is {}".format(output_dir)))

    value_ranges, value_names = make_value_ranges()
    parameter_sweep_utilities.save_values(value_names, value_ranges,
                                          "individual_value_ranges.csv",
                                          output_dir)

    all_value_combos = parameter_sweep_utilities.make_all_value_combinations(
        value_ranges)
    parameter_sweep_utilities.save_values(value_names, all_value_combos,
                                          "all_value_combos.csv", output_dir)

    parameter_sweep(
        all_value_combos,
        value_names,
        output_dir,
        bag_file,
        map_file,
        image_topic,
        config_path,
        robot_config,
        world,
        use_image_features,
        groundtruth_bagfile,
        rmse_rel_start_time,
        rmse_rel_end_time,
    )
    combined_results_file = os.path.join(output_dir,
                                         "param_sweep_combined_results.csv")
    value_combos_file = os.path.join(output_dir, "all_value_combos.csv")
    results_pdf_file = os.path.join(output_dir, "param_sweep_results.pdf")
    plot_parameter_sweep_results.create_plots(results_pdf_file,
                                              combined_results_file,
                                              value_combos_file)
    return output_dir
Example #14
def distribute_statistics_local(product_id, source_path, destination_path):
    '''
    Description:
        Copies the statistics to the specified directory on the local system

    Parameters:
        product_id - The unique product ID associated with the files.
        source_path - The full path to where the statistics files to
                      distribute reside.
        destination_path - The full path on the local system to copy the
                           statistics files into.

    Note:
        - It is assumed a stats directory exists under the source_path
        - A stats directory will be created under the destination path
    '''

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    d_name = 'stats'

    # Save the current directory location and change to the source directory
    current_directory = os.getcwd()
    os.chdir(source_path)

    try:
        stats_path = os.path.join(destination_path, d_name)
        stats_files = ''.join([d_name, '/', product_id, '*'])

        # Create the statistics directory under the destination path
        logger.info("Creating directory {0}".format(stats_path))
        utilities.create_directory(stats_path)

        # Remove any pre-existing statistics for this product ID
        cmd = ' '.join(['rm', '-f', os.path.join(destination_path,
                                                 stats_files)])
        output = ''
        try:
            output = utilities.execute_cmd(cmd)
        except Exception as e:
            raise ee.ESPAException(ee.ErrorCodes.distributing_product,
                                   str(e)), None, sys.exc_info()[2]
        finally:
            if len(output) > 0:
                logger.info(output)

        # Transfer the statistics files
        for file_path in glob.glob(stats_files):
            filename = os.path.basename(file_path)
            dest_file_path = os.path.join(stats_path, filename)

            logger.info("Copying {0} to {1}".format(filename, dest_file_path))
            shutil.copyfile(file_path, dest_file_path)

    except Exception as e:
        logger.exception("An exception occurred processing {0}".
                         format(product_id))
        e_code = ee.ErrorCodes.distributing_product
        raise ee.ESPAException(e_code, str(e)), None, sys.exc_info()[2]

    finally:
        # Change back to the previous directory
        os.chdir(current_directory)
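
This example (and the two immutability-aware variants later on) shells out through utilities.execute_cmd and logs whatever it returns. A rough sketch of such a wrapper, assuming it raises on a non-zero exit status and returns the command's combined output; the actual ESPA utility may behave differently:

import subprocess

def execute_cmd(cmd):
    # Hypothetical helper: run a shell command, return its combined output,
    # and raise if the command exited with a non-zero status
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError('command failed ({}): {}\n{}'.format(
            proc.returncode, cmd, proc.stderr))
    return (proc.stdout or '') + (proc.stderr or '')
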
Example #15
def Deposito():
    import pandas as pd
    from utilities import Print_Error,get_files,Select_Menu,create_directory,OpenFile



    available_file = []

    search_locations = []
    save_location = ""
    main_config = open("CONFIG/MAIN.config")
    for line in main_config:
        if (line.split(';')[0] == "search_location"):
            search_locations.append(line.split(';')[1].strip())
        elif (line.split(';')[0] == "save_location"):
            save_location=line.split(';')[1].strip()

    for file in get_files(search_locations):
        if(file.upper().endswith('.XLS')):
            available_file.append(file)
    file_name = Select_Menu(available_file,"Select a File",return_type=int)
    file_name = available_file[(file_name)]




    if (file_name.upper().endswith(".XLS")):
        print "Importing XLS File!"
        sheet = "LinnerBooking"
        df = pd.read_excel(io=file_name, sheet_name=sheet)

        df = df[['Booking','Deposito','Weight','Tipo Ctr']]

        df = df.loc[(df['Deposito'] == "MEDLOG SAN ANTONIO") | (df['Deposito'] == "SITRANS SAI ALTO DEPOT")
        | (df['Deposito'] == "SITRANS VALPARAISO DEPOT")
        |(df['Deposito'] == "MEDLOG SANTIAGO")]

        df['Weight'] = df['Weight']/1000  # Convert to tons
        # df = df.loc[(df['Tipo Ctr'] == '20DV') | (df['Tipo Ctr'] == '40DV') | (df['Tipo Ctr'] == '40HC')]
        table = pd.pivot_table(df,values='Weight',aggfunc='count',index='Deposito',columns='Tipo Ctr')
        table = table.reindex(columns=['20DV', '40DV', '40HC'])

        table = table.rename(index={'MEDLOG SAN ANTONIO':'SAI','SITRANS SAI ALTO DEPOT':'SAI',
                                    'SITRANS VALPARAISO DEPOT':'VAP','MEDLOG SANTIAGO':'STGO'})

        table = table.groupby('Deposito').sum()
        # print table.iloc[0]['20DV']



        import openpyxl
        import os

        wb = openpyxl.Workbook()
        sheet = wb.active

        list = []



        print table


        data = []

        for y in range(len(table.index)):
            data.append([])
            for x in range(len(table.columns)):
                data[-1].append(table.iloc[y][x])

        x = 1
        z = 0
        for deposit in data:
            r = 0
            sheet.cell(1,x,str(table.index[z]))
            for value in deposit:
                sheet.cell(2,x,str(table.columns[r]))
                sheet.cell(3,x,float(value))
                x+=1
                r+=1
            x+=1
            z+=1



        wb.save('demo.xlsx')
        wb.close()
        import subprocess




        if (save_location == ""):
            print "Saving Output in Program Location!"
        elif (not os.path.exists(save_location)):
            Print_Error("Save Directory Not Found!")
            create_directory(save_location)



        try:
            table.to_excel(save_location+'/file_output.xlsx')
            print "Saved Succesfully"
        except:
            Print_Error('Error Saving File!')


        directory =  os.getcwd() + '/demo.xlsx'
        OpenFile(directory)


    else:
        Print_Error("File not compatible!")
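
The heart of the example above is the pivot/rename/groupby reshaping. A self-contained Python 3 sketch of just that step, with a fabricated input frame (column names are taken from the example, the data is invented for illustration):

import pandas as pd

# Fabricated sample rows with the columns the example reads from the XLS file
df = pd.DataFrame({
    'Deposito': ['MEDLOG SAN ANTONIO', 'SITRANS VALPARAISO DEPOT',
                 'MEDLOG SANTIAGO', 'MEDLOG SAN ANTONIO'],
    'Tipo Ctr': ['20DV', '40HC', '40DV', '20DV'],
    'Weight': [18000, 26000, 21000, 19000],
})
df['Weight'] = df['Weight'] / 1000  # convert to tons

# Count containers per depot and container type, then collapse depots to ports
table = pd.pivot_table(df, values='Weight', aggfunc='count',
                       index='Deposito', columns='Tipo Ctr')
table = table.reindex(columns=['20DV', '40DV', '40HC'])
table = table.rename(index={'MEDLOG SAN ANTONIO': 'SAI',
                            'SITRANS SAI ALTO DEPOT': 'SAI',
                            'SITRANS VALPARAISO DEPOT': 'VAP',
                            'MEDLOG SANTIAGO': 'STGO'})
table = table.groupby('Deposito').sum()
print(table)
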
Example #16
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--output-dir", help="output directory", type=str, required=True
    )
    parser.add_argument("--month", help="month", type=str, required=True)
    parser.add_argument("--year", help="year", type=str, required=True)
    parser.add_argument(
        "--import-csv",
        help="yes/no : Whether to import existing csv file.Default is 'no'",
        type=str,
        default="no",
    )
    args = parser.parse_args()
    main_dir = args.output_dir
    n_month, n_year = args.month.lower(), args.year
    work_dir = main_dir + "//" + n_month + "_" + n_year
    create_directory(main_dir)
    create_directory(work_dir)
    log_file_write = open(work_dir + "//scrape_en-hi_log_file.txt", mode="w")
    log_file_write.write(f"{n_month,n_year}\n")

    if args.import_csv.lower() == "yes":
        set_import = True
    elif args.import_csv.lower() == "no":
        set_import = False
    else:
        log_file_write.write(f"\n Please enter a valid option for import-csv")

    scrape_loc_en = work_dir + "//" + "scrape_file_en_" + n_month + "_" + n_year
    scrape_loc_hi = work_dir + "//" + "scrape_file_hi_" + n_month + "_" + n_year
    create_directory(scrape_loc_hi)
    create_directory(scrape_loc_en)
    url_file_loc = "file:///" + HTML_FOLDER + "//Press Information Bureau."
    filename_url_en = url_file_loc + "_en_" + n_month + "_" + n_year + ".html"
    filename_url_hi = url_file_loc + "_hi_" + n_month + "_" + n_year + ".html"

    ministy_pa_list = pd.read_csv(
        MINISTRY_NAME_PARALLEL_LOCATION,
        encoding="utf-16",
    )
    parse_url_en = get_html(filename_url_en)
    parse_url_hi = get_html(filename_url_hi)
    no_of_result_en = int(
        (parse_url_en.find("div", {"class": "search_box_result"}).contents[0]).split()[
            1
        ]
    )
    no_of_result_hi = int(
        (parse_url_hi.find("div", {"class": "search_box_result"}).contents[0]).split()[
            1
        ]
    )
    log_file_write.write(f"\nNo of search result in {n_month} of {n_year}:")
    log_file_write.write(f"\n English: {no_of_result_en} \n Hindi: {no_of_result_hi}")
    log_file_write.write(
        f"\nNo of Ministry in English search result:\
                         {len(parse_url_en.findAll('h3',{'class':'font104'}))}"
    )
    log_file_write.write(
        f"\nNo of Ministry in Hindi search result:\
                         {len(parse_url_hi.findAll('h3',{'class':'font104'}))}"
    )

    # Import or Create english dataframe
    df_en = get_data(
        n_month,
        n_year,
        filename_url_en,
        ministy_pa_list,
        "en",
        log_file_write,
        import_data=set_import,
        import_data_dir=work_dir,
    )
    if "PRID" not in df_en.columns.tolist():
        df_en["PRID"] = df_en["Link"].apply(lambda x: x.split("=")[-1])
    log_file_write.write(f"\n English Datframe \n")
    log_file_write.write(f"\n Datframe Info:\n")
    df_en.info(buf=log_file_write)

    # Write the English Dataframe
    df_en.to_csv(
        os.path.join(work_dir, "English_data_" + n_month + "_" + n_year + ".csv"),
        index=False,
        encoding="utf-16",
    )

    # Scraping English Documents
    iter_f = df_en.shape[0]
    log_file_write.write("\nStarting scraping for English Document")
    for i in range(iter_f):
        en_scrape_file = (
            scrape_loc_en
            + "//"
            + str(i).zfill(4)
            + "_en_"
            + "_".join(df_en.loc[i, ["English_Ministry_Name"]].values[0].split())
            + "_"
            + df_en.loc[i, ["Posting_Date"]].values[0].strftime("%Y-%m-%d")
            + "_"
            + str(df_en.loc[i, ["PRID"]].values[0])
            + ".txt"
        )
        m = 0
        while m == 0:
            try:
                b = get_html(df_en.Link[i], "lxml")
                m = b.body.form.find(
                    "div", {"class": "innner-page-main-about-us-content-right-part"}
                )
            except:
                log_file_write.write("\nerror:retrying")
                m = 0
        if m is None:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_en.Link[i]}, no english content found"
            )
            continue
        k_en = [
            str(k.get_text()).strip()
            for k in m.findAll(
                [
                    "div",
                    "tr",
                    "td",
                    "p",
                    "ol",
                    "h2",
                    "h3",
                    "h4",
                    "ul",
                    "pre",
                    "span",
                    "li",
                ]
            )
            if len(
                k.find_parents(["p", "ol", "h2", "h3", "h4", "ul", "pre", "span", "li"])
            )
            == 0
        ]
        if len(k_en) == 0:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_en.Link[i]},no English content in variuos tags"
            )
            continue
        log_file_write.write(f"\nindex: {i}, number of lines: {len(k_en)}")
        write_scrape_text_file(en_scrape_file, k_en, df_en.English_Ministry_Name[i])
    log_file_write.write(f"\nDone scraping for English Document")

    # Import or Create hindi dataframe
    df_hi = get_data(
        n_month,
        n_year,
        filename_url_hi,
        ministy_pa_list,
        "hi",
        log_file_write,
        import_data=set_import,
        import_data_dir=work_dir,
    )
    if "PRID" not in df_hi.columns.tolist():
        df_hi["PRID"] = df_hi["Link"].apply(lambda x: x.split("=")[-1])
    log_file_write.write(f"\nHindi Datframe \n")
    log_file_write.write(f"\nDatframe Info:\n")
    df_hi.info(buf=log_file_write)

    # Write the Hindi Dataframe
    df_hi.to_csv(
        os.path.join(work_dir, "Hindi_data_" + n_month + "_" + n_year + ".csv"),
        index=False,
        encoding="utf-16",
    )

    # Scraping Hindi Documents
    iter_f = df_hi.shape[0]
    log_file_write.write("\nStarting scraping for Hindi Document")
    for i in range(iter_f):
        hi_scrape_file = (
            scrape_loc_hi
            + "//"
            + str(i).zfill(4)
            + "_hi_"
            + "_".join(df_hi.loc[i, ["English_Ministry_Name"]].values[0].split())
            + "_"
            + df_hi.loc[i, ["Posting_Date"]].values[0].strftime("%Y-%m-%d")
            + "_"
            + str(df_hi.loc[i, ["PRID"]].values[0])
            + ".txt"
        )
        m = 0
        while m == 0:
            try:
                b = get_html(df_hi.Link[i], "lxml")
                m = b.body.form.find(
                    "div", {"class": "innner-page-main-about-us-content-right-part"}
                )
            except:
                log_file_write.write("\nerror:retrying")
                m = 0
        if m is None:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_hi.Link[i]}, no hindi content found"
            )
            continue
        k_hi = [
            str(k.get_text()).strip()
            for k in m.findAll(
                [
                    "div",
                    "tr",
                    "td",
                    "p",
                    "ol",
                    "h2",
                    "h3",
                    "h4",
                    "ul",
                    "pre",
                    "span",
                    "li",
                ]
            )
            if len(
                k.find_parents(["p", "ol", "h2", "h3", "h4", "ul", "pre", "span", "li"])
            )
            == 0
        ]
        if len(k_hi) == 0:
            log_file_write.write(
                f"\nindex: {i}, Link: {df_hi.Link[i]},no hindi content in variuos tags"
            )
            continue
        log_file_write.write(f"\nindex: {i}, number of lines: {len(k_hi)}")
        write_scrape_text_file(hi_scrape_file, k_hi, df_hi.Hindi_Ministry_Name[i])
    log_file_write.write("\nDone scraping for Hindi Document")
    log_file_write.close()
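
The scraping loops above revolve around a get_html helper that is not part of the snippet. A minimal sketch under the assumption that it fetches a URL (including the local file:/// URLs used for the archive pages) and returns a BeautifulSoup tree:

from urllib.request import urlopen

from bs4 import BeautifulSoup

def get_html(url, parser="html.parser"):
    # Hypothetical helper: urlopen handles http(s):// as well as file:///,
    # so the same call works for live pages and the saved archive pages
    with urlopen(url) as response:
        return BeautifulSoup(response.read(), parser)
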
Example #17
def distribute_statistics_local(immutability, product_id, source_path,
                                destination_path):
    '''
    Description:
        Copies the statistics to the specified directory on the local system

    Parameters:
        product_id - The unique product ID associated with the files.
        source_path - The full path to where the statistics files to
                      distribute reside.
        destination_path - The full path on the local system to copy the
                           statistics files into.

    Note:
        - It is assumed a stats directory exists under the source_path
        - A stats directory will be created under the destination path
    '''

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    d_name = 'stats'

    # Save the current directory location and change to the source directory
    current_directory = os.getcwd()
    os.chdir(source_path)

    try:
        stats_wildcard = ''.join([product_id, '*'])
        stats_path = os.path.join(destination_path, d_name)
        stats_files = os.path.join(d_name, stats_wildcard)
        dest_stats_wildcard = os.path.join(stats_path, stats_wildcard)

        # Create the statistics directory under the destination path
        logger.info("Creating directory {0}".format(stats_path))
        utilities.create_directory(stats_path)

        # Change the attributes on the files so that we can remove them
        if immutability:
            cmd = ' '.join(['sudo', 'chattr', '-if', dest_stats_wildcard])
            output = ''
            try:
                output = utilities.execute_cmd(cmd)
            except Exception:
                pass
            finally:
                if len(output) > 0:
                    logger.info(output)

        # Remove any pre-existing statistics for this product ID
        cmd = ' '.join(['rm', '-f', dest_stats_wildcard])
        output = ''
        try:
            output = utilities.execute_cmd(cmd)
        finally:
            if len(output) > 0:
                logger.info(output)

        # Transfer the statistics files
        for file_path in glob.glob(stats_files):
            filename = os.path.basename(file_path)
            dest_file_path = os.path.join(stats_path, filename)

            logger.info("Copying {0} to {1}".format(filename, dest_file_path))
            shutil.copyfile(file_path, dest_file_path)

        # Change the attributes on the files so that we can't remove them
        if immutability:
            cmd = ' '.join(['sudo', 'chattr', '+i', dest_stats_wildcard])
            output = ''
            try:
                output = utilities.execute_cmd(cmd)
            finally:
                if len(output) > 0:
                    logger.info(output)

    except Exception:
        logger.exception('An exception occurred processing {0}'.
                         format(product_id))
        raise

    finally:
        # Change back to the previous directory
        os.chdir(current_directory)
Example #18
def package_product(immutability, source_directory, destination_directory,
                    product_name):
    '''
    Description:
      Package the contents of the source directory into a gzipped tarball
      located in the destination directory and generates a checksum file for
      it.

      The filename will be prefixed with the specified product name.

    Returns:
      product_full_path - The full path to the product including filename
      cksum_full_path - The full path to the check sum including filename
      cksum_value - The checksum value
    '''

    logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER)

    product_full_path = os.path.join(destination_directory, product_name)

    # Make sure the directory exists.
    utilities.create_directory(destination_directory)

    # Remove any pre-existing files
    # Grab the first part of the filename, which is not unique
    filename_parts = product_full_path.split('-')
    filename_parts[-1] = '*'  # Replace the last element of the list
    filename = '-'.join(filename_parts)  # Join with '-'

    # Name of the checksum to be created
    cksum_filename = '.'.join([product_name, settings.ESPA_CHECKSUM_EXTENSION])

    # Change the attributes on the files so that we can remove them
    if immutability:
        cmd = ' '.join(['sudo', 'chattr', '-if', filename, cksum_filename])
        output = ''
        try:
            output = utilities.execute_cmd(cmd)
        except Exception:
            pass
        finally:
            if len(output) > 0:
                logger.info(output)

    # Remove the file first just in-case this is a second run
    cmd = ' '.join(['rm', '-f', filename])
    output = ''
    try:
        output = utilities.execute_cmd(cmd)
    finally:
        if len(output) > 0:
            logger.info(output)

    # Change to the source directory
    current_directory = os.getcwd()
    os.chdir(source_directory)

    try:
        # Tar the files
        logger.info("Packaging completed product to %s.tar.gz"
                    % product_full_path)

        # Grab the files to tar and gzip
        product_files = glob.glob("*")

        # Execute tar with zipping, the full/path/*.tar.gz name is returned
        product_full_path = utilities.tar_files(product_full_path,
                                                product_files, gzip=True)

        # Change file permissions
        logger.info("Changing file permissions on %s to 0644"
                    % product_full_path)
        os.chmod(product_full_path, 0644)

        # Verify that the archive is good
        output = ''
        cmd = ' '.join(['tar', '-tf', product_full_path])
        try:
            output = utilities.execute_cmd(cmd)
        finally:
            if len(output) > 0:
                logger.info(output)

        # If it was good create a checksum file
        cksum_output = ''
        cmd = ' '.join([settings.ESPA_CHECKSUM_TOOL, product_full_path])
        try:
            cksum_output = utilities.execute_cmd(cmd)
        finally:
            if len(cksum_output) > 0:
                logger.info(cksum_output)

        # Get the base filename of the file that was checksum'd
        cksum_prod_filename = os.path.basename(product_full_path)

        logger.debug("Checksum file = %s" % cksum_filename)
        logger.debug("Checksum'd file = %s" % cksum_prod_filename)

        # Make sure they are strings
        cksum_values = cksum_output.split()
        cksum_value = "%s %s" % (str(cksum_values[0]),
                                 str(cksum_prod_filename))
        logger.info("Generating cksum: %s" % cksum_value)

        cksum_full_path = os.path.join(destination_directory, cksum_filename)

        try:
            with open(cksum_full_path, 'wb+') as cksum_fd:
                cksum_fd.write(cksum_value)
        except Exception:
            logger.exception('Error building checksum file')
            raise

    finally:
        # Change back to the previous directory
        os.chdir(current_directory)

    return (product_full_path, cksum_full_path, cksum_value)
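
package_product relies on utilities.tar_files returning the full path of the archive it writes. A rough sketch of such a helper built on the standard tarfile module (the actual ESPA utility may shell out to tar instead):

import tarfile

def tar_files(base_path, file_list, gzip=False):
    # Hypothetical helper: bundle file_list into <base_path>.tar(.gz) and
    # return the full path of the archive that was written
    full_path = base_path + ('.tar.gz' if gzip else '.tar')
    mode = 'w:gz' if gzip else 'w'
    with tarfile.open(full_path, mode) as tar:
        for item in file_list:
            tar.add(item)
    return full_path
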
Example #19
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--output-dir", help="output directory", type=str, required=True
    )
    parser.add_argument("--month", help="month", type=str, required=True)
    parser.add_argument("--year", help="year", type=str, required=True)
    args = parser.parse_args()

    n_month, n_year = str(args.month).lower(), str(args.year)
    work_dir = args.output_dir + "//" + n_month + "_" + n_year
    align_loc = work_dir + "//" + "align_" + n_month + "_" + n_year
    tokenize_loc_en = work_dir + "//" + "tokenize_en_" + n_month + "_" + n_year
    tokenize_loc_hi = work_dir + "//" + "tokenize_hi_" + n_month + "_" + n_year
    submit_aligner = work_dir + "//" + "submit_aligner_" + n_month + "_" + n_year
    en_data_file = "_".join(["English", "data", n_month, n_year]) + ".csv"
    hi_data_file = "_".join(["Hindi", "data", n_month, n_year]) + ".csv"

    create_directory(align_loc)
    create_directory(submit_aligner)

    df_en = pd.read_csv(work_dir + "//" + en_data_file, encoding="utf-16")
    df_hi = pd.read_csv(work_dir + "//" + hi_data_file, encoding="utf-16")
    df_en = preprocess_dataframe(df_en)
    df_hi = preprocess_dataframe(df_hi)
    df_en.to_csv(work_dir + "//" + en_data_file, index=False, encoding="utf-16")
    df_hi.to_csv(work_dir + "//" + hi_data_file, index=False, encoding="utf-16")

    # Create files which are parallel based on Ministry Name and Posting Date
    k_hi = pd.DataFrame(
        df_hi[["English_Ministry_Name", "Posting_Date", "index"]]
        .groupby(["English_Ministry_Name", "Posting_Date"])["index"]
        .apply(lambda x: x.tolist())
    )
    k_en = pd.DataFrame(
        df_en[["English_Ministry_Name", "Posting_Date", "index"]]
        .groupby(["English_Ministry_Name", "Posting_Date"])["index"]
        .apply(lambda x: x.tolist())
    )
    k_merge = pd.merge(
        k_en,
        k_hi,
        left_index=True,
        right_index=True,
        how="inner",
        suffixes=("_en", "_hi"),
    )
    k_merge.to_csv(
        work_dir + "//" + "submit_aligner_" + n_month + "_" + n_year + ".csv",
        index=True,
        encoding="utf-16",
    )

    fl_tok_en = sorted(glob.glob(tokenize_loc_en + "//" + "*.txt"))
    fl_tok_hi = sorted(glob.glob(tokenize_loc_hi + "//" + "*.txt"))
    no_sen_df = pd.DataFrame(
        columns=[
            "Filename_en",
            "Total_sentences_en",
            "Filename_hi",
            "Total_sentences_hi",
        ]
    )
    for count, i in enumerate(k_merge.iterrows()):
        en_align_file = (
            submit_aligner
            + "//subalign_"
            + str(count).zfill(4)
            + "_en_"
            + "_".join(i[0][0].split())
            + "_"
            + i[0][1].strftime("%Y-%m-%d")
            + ".txt"
        )
        hi_align_file = (
            submit_aligner
            + "//subalign_"
            + str(count).zfill(4)
            + "_hi_"
            + "_".join(i[0][0].split())
            + "_"
            + i[0][1].strftime("%Y-%m-%d")
            + ".txt"
        )
        with open(en_align_file, encoding="utf-16", mode="w") as flw_en:
            count_en = 0
            for ind in i[1]["index_en"]:
                with open(fl_tok_en[ind], encoding="utf-16", mode="r") as flr_en:
                    k_en = flr_en.read()
                    count_en += k_en.count("\n")
                    flw_en.write(k_en)
        with open(hi_align_file, encoding="utf-16", mode="w") as flw_hi:
            count_hi = 0
            for ind in i[1]["index_hi"]:
                with open(fl_tok_hi[ind], encoding="utf-16", mode="r") as flr_hi:
                    k_hi = flr_hi.read()
                    count_hi += k_hi.count("\n")
                    flw_hi.write(k_hi)
        no_sen_df = no_sen_df.append(
            {
                "Filename_en": os.path.basename(en_align_file),
                "Total_sentences_en": count_en,
                "Filename_hi": os.path.basename(hi_align_file),
                "Total_sentences_hi": count_hi,
            },
            ignore_index=True,
        )
        print(
            f"Writing {os.path.basename(en_align_file)} and {os.path.basename(hi_align_file)} done"
        )
    no_sen_df.to_csv(
        work_dir + "//" + "tok_sen_count_" + n_month + "_" + n_year + ".csv",
        index=False,
        encoding="utf-16",
    )
    fl_list = glob.glob(submit_aligner + "//" + "*.txt")
    en_fl = sorted([i for i in fl_list if os.path.basename(i).split("_")[2] == "en"])
    hi_fl = sorted([i for i in fl_list if os.path.basename(i).split("_")[2] == "hi"])
    c_fl = list(zip(en_fl, hi_fl))
    for i in c_fl:
        extract_bitext(BEARER_TOKEN, align_loc, i[0], i[1])
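
One portability note on the alignment script above: DataFrame.append was removed in pandas 2.0, so the no_sen_df.append(...) calls only run on older pandas. The usual replacement is to collect the summary rows in a list and build the frame once; a small sketch with made-up values:

import pandas as pd

rows = []
# One dict per aligned file pair (values here are hypothetical)
rows.append({
    "Filename_en": "subalign_0000_en_example.txt",
    "Total_sentences_en": 0,
    "Filename_hi": "subalign_0000_hi_example.txt",
    "Total_sentences_hi": 0,
})
no_sen_df = pd.DataFrame(rows, columns=["Filename_en", "Total_sentences_en",
                                        "Filename_hi", "Total_sentences_hi"])
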
Example #20
def scrape_pib_archives(df_data, month, year, lang, out_dir, list_ministry):
    """
    Scrape text using the links provided in the dataframe
    Create and write a new dataframe with posting date-time
    """
    print(
        f'Scraping for {month}, {year}, {"English" if lang=="en" else "Hindi"}'
    )
    n_month, n_year = str(month), str(year)
    n_month = n_month.lower()
    main_dir = out_dir
    work_dir = main_dir + "//" + n_month + "_" + n_year
    create_directory(work_dir)
    if lang == "en":
        language = "English"
        language_2 = "Hindi"
        scrape_loc = work_dir + "//" + "scrape_file_en_" + n_month + "_" + n_year
    elif lang == "hi":
        language = "Hindi"
        language_2 = "English"
        scrape_loc = work_dir + "//" + "scrape_file_hi_" + n_month + "_" + n_year
    else:
        print("Pass valid language code")
        return None
    create_directory(scrape_loc)
    df_data[language + "_Ministry_Name"] = [""] * df_data.shape[0]
    df_data[language_2 + "_Ministry_Name"] = [""] * df_data.shape[0]
    df_data["Posting_Datetime"] = [pd.to_datetime(np.nan)] * df_data.shape[0]
    df_data["Posting_Date"] = df_data["Posting_Datetime"].apply(
        lambda x: x.date())
    for p_th in range(df_data.shape[0])[:]:
        b_source = get_html(df_data.loc[p_th, "Link"])
        m_dt = b_source.find("div", attrs={"class": "mddiv content-ministry"})
        m = b_source.find("div", attrs={"class": "contentdiv"})
        df_data.at[p_th, language + "_Ministry_Name"] = str(" ".join(
            m_dt.contents[0].strip().split()))
        if (str(" ".join(m_dt.contents[0].strip().split()))
                not in list_ministry[language +
                                     "_Ministry_Name"].values.tolist()):
            print(
                "Ministry name missing:",
                str(" ".join(m_dt.contents[0].strip().split())),
            )
        else:
            df_data.at[p_th, language_2 + "_Ministry_Name"] = list_ministry[
                list_ministry[language + "_Ministry_Name"] == df_data.at[
                    p_th, language +
                    "_Ministry_Name"]][language_2 + "_Ministry_Name"].values[0]

        df_data.at[p_th, "Posting_Datetime"] = pd.to_datetime(
            (" ".join(m_dt.contents[1].text.split()[:-1])).replace(
                ".", ":").replace(": ", ":"))
        df_data.at[p_th,
                   "Posting_Date"] = df_data.at[p_th,
                                                "Posting_Datetime"].date()
        scrape_file = (
            scrape_loc + "//" + str(p_th).zfill(4) + "_" + lang + "_" +
            "_".join(df_data.loc[p_th,
                                 ["English_Ministry_Name"]].values[0].split())
            + "_" +
            df_data.loc[p_th, ["Posting_Date"]].values[0].strftime("%Y-%m-%d")
            + "_" + df_data.loc[p_th, "Link"].split("=")[-1] + ".txt")
        k_en = [
            str(k.get_text()).strip() for k in m.findAll([
                "div",
                "tr",
                "td",
                "p",
                "ol",
                "h2",
                "h3",
                "h4",
                "ul",
                "pre",
                "span",
                "li",
            ]) if len(
                k.find_parents([
                    "p", "ol", "h2", "h3", "h4", "ul", "pre", "span", "li"
                ])) == 0
        ]
        with open(scrape_file, mode="w", encoding="utf-16") as file_w:
            for line in k_en:
                if "@font-face" in line.strip():
                    continue
                line = re.sub("\r\n-", "\n-", line)
                line = re.sub("\.\s+\r\n", ".\n", line)
                #         print(line)
                line = re.sub(":\s+\r\n", ":\n", line)
                line = re.sub(";\s+\r\n", ";\n", line)
                line = line.replace("\r\n", " ")
                for ln in line.split("\n"):
                    ln = ln.strip()
                    if len(ln.strip()) == 0:
                        continue
                    if "@font-face" in ln.strip():
                        continue
                    ln = " ".join(ln.split())

                    file_w.write(ln.strip().replace("\r", "") + "\n")
    print(df_data.info())
    if True:
        df_data.to_csv(
            os.path.join(work_dir, language + "_data_" + n_month + "_" +
                         n_year + ".csv"),
            index=False,
            encoding="utf-16",
        )
Example #21
    param_range_directory_for_bag = parameter_sweep.make_values_and_parameter_sweep(
      bag_output_dir, graph_bag_params.bagfile, graph_bag_params.map_file, graph_bag_params.image_topic,
      graph_bag_params.config_path, graph_bag_params.robot_config_file, graph_bag_params.world,
      graph_bag_params.use_image_features)
    if not param_range_directory:
      param_range_directory = param_range_directory_for_bag
    combined_results_csv_files.append(os.path.join(bag_output_dir, 'param_sweep_combined_results.csv'))
  average_parameter_sweep_results(combined_results_csv_files, output_dir)
  save_ranges(param_range_directory, output_dir)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('config_file')
  parser.add_argument('output_dir')
  args = parser.parse_args()
  if not os.path.isfile(args.config_file):
    print('Config file ' + args.config_file + ' does not exist.')
    sys.exit()
  if os.path.isdir(args.output_dir):
    print('Output directory ' + args.output_dir + ' already exists.')
    sys.exit()
  output_dir = utilities.create_directory(args.output_dir)

  graph_bag_params_list = bag_sweep.load_params(args.config_file)
  bag_and_parameter_sweep(graph_bag_params_list, output_dir)
  combined_results_file = os.path.join(output_dir, 'bag_and_param_sweep_stats.csv')
  value_combos_file = os.path.join(output_dir, 'all_value_combos.csv')
  results_pdf_file = os.path.join(output_dir, 'bag_and_param_sweep_results.pdf')
  plot_parameter_sweep_results.create_plots(results_pdf_file, combined_results_file, value_combos_file)
Example #22
                         n_year + ".csv"),
            index=False,
            encoding="utf-16",
        )


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--output-dir",
                        help="output directory",
                        type=str,
                        required=True)
    parser.add_argument("--month", help="month", type=str, required=True)
    parser.add_argument("--year", help="year", type=str, required=True)
    args = parser.parse_args()
    create_directory(args.output_dir)
    # creating release id and url link datframe
    # also creating ministry list
    df_en, list_ministry_en = get_prid_and_ministry_list(
        args.month, args.year, "en")
    df_hi, list_ministry_hi = get_prid_and_ministry_list(
        args.month, args.year, "hi")
    if len(list_ministry_en) == len(list_ministry_hi):
        print(len(list_ministry_en), len(list_ministry_hi))
        ministry_data = pd.DataFrame(
            list(zip(list_ministry_en, list_ministry_hi)),
            columns=["English_Ministry_Name", "Hindi_Ministry_Name"],
        )
        # Scraping the url links
        scrape_pib_archives(df_en, args.month, args.year, "en",
                            args.output_dir, ministry_data)