Example #1
def main():
    parser = _get_parser()
    opt = parser.parse_args()
    log_info(opt)

    preprocess(opt.dataset_path, opt.preprocess_mode)

    if opt.output_unit == 'character':
        generate_character_labels(opt.dataset_path, opt.labels_dest)
        generate_character_script(opt.dataset_path, opt.new_path,
                                  opt.script_prefix, opt.labels_dest)

    elif opt.output_unit == 'subword':
        generate_sentencepiece_input(opt.dataset_path)
        if not opt.use_pretrain_kobert_tokenizer:
            train_sentencepiece(opt.dataset_path, opt.vocab_size)
        generate_subword_labels('aihub_sentencepiece.vocab', opt.labels_dest,
                                opt.use_pretrain_kobert_tokenizer)
        generate_subword_script(opt.dataset_path, opt.new_path,
                                opt.script_prefix)

    elif opt.output_unit == 'grapheme':
        character_to_grapheme(opt.dataset_path, opt.grapheme_save_path)
        generate_grapheme_labels(opt.grapheme_save_path, opt.labels_dest)
        generate_grapheme_script(opt.grapheme_save_path, opt.new_path,
                                 opt.script_prefix, opt.labels_dest)

    else:
        raise ValueError("Unsupported preprocess method : {0}".format(
            opt.output_unit))

    gather_files(opt.dataset_path, opt.new_path)
Example #2
def get_children(children: []) -> [Node]:
    nodes = []
    for child in children:
        nodes.append(
            Node(child["question"], child["typical_answer"],
                 [[preprocess(x[0]), preprocess(x[1])]
                  for x in child["solutions"]],
                 get_children(child["children"])))

    return nodes
Example #3
File: utils.py Project: psm8/chatbot-ninja
def ask_if_it_helped(solution):
    print(solution.text)
    yes = preprocess.preprocess("yes")
    no = preprocess.preprocess("no")

    answer = preprocess.preprocess(input("Was it helpful? "))

    if yes.similarity(answer) - no.similarity(answer) > 0:
        print("You're welcome")
        return True

    else:
        return False
Example #4
File: node.py Project: psm8/chatbot-ninja
    def __init__(self, data, answer, solutions=None, children=None):

        self.data = SpaCyTreeNode(preprocess.preprocess(data),
                                  preprocess.preprocess(answer))
        self.parent = None
        self.children = []
        self.level = 0
        self.base_boarder = 0.75
        self.base_boarder_full_search = 0.91

        if solutions is not None:
            for solution in solutions:
                self.data.add_solution(solution[0], solution[1])
        if children is not None:
            for child in children:
                self.addChild(child)
Example #5
def convert_sen3(sen3_file):
    """
    Convert sentinel3 data to reflectance
    Args:
        sen3_file: path to the Sentinel-3 product to convert

    Returns:
        tuple with data bands as M x N np.arrays:
        (
            S1_reflectance_an,
            S2_reflectance_an,
            S3_reflectance_an,
            S4_reflectance_an,
            S5_reflectance_an,
            S6_reflectance_an,
            S7_BT_in,
            S8_BT_in,
            S9_BT_in,
        )
        Affine transform

    """
    sen3_file = Path(sen3_file)
    cfg = conftools.load_directory(Path(__file__).parent / "config")
    cfg['workdir'] = sen3_file.parents[0]
    cfg['tmpdir'] = sen3_file.parents[0]
    ofile = preprocess(sen3_file, cfg, overwrite=False)

    data_channels, s3_transform = read_ofile(ofile)

    # TODO: remove tmp-file

    return data_channels, s3_transform
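The docstring above spells out the return value: nine M x N bands plus an affine transform. A minimal usage sketch that stacks the bands and writes them to a GeoTIFF, assuming rasterio is available and using a hypothetical product name and output path:

import numpy as np
import rasterio

# hypothetical .SEN3 product directory
bands, transform = convert_sen3("S3A_SL_1_RBT_example.SEN3")

stack = np.stack(bands)  # (9, M, N): S1-S6 reflectance, S7-S9 brightness temperature
with rasterio.open("sen3_stack.tif", "w", driver="GTiff",
                   height=stack.shape[1], width=stack.shape[2],
                   count=stack.shape[0], dtype=stack.dtype,
                   transform=transform) as dst:
    dst.write(stack)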
Example #6
    def get_fa_scores(self,
                      df,
                      doc_colname,
                      save_path=None,
                      tfidf=False,
                      format="virtue_vice"):
        df = df.reset_index(drop=True)
        docs = df[doc_colname]
        print(f'Preprocessing column {doc_colname}')
        docs = preprocess(docs).reset_index(drop=True)
        baseline_docs = [
        ]  # todo docs.sample(frac=0.3, random_state=157).reset_index(drop=True)
        # todo build the w2v model
        print('Let\'s calculate bias and intensity')
        bias, intensity = self.doc_scores(docs=docs,
                                          baseline_docs=baseline_docs,
                                          tfidf=tfidf)
        print('total size: ', df.shape[0])
        print('any NaN in bias?',
              np.isnan(bias.values).sum()
              )  # Nan means empty docs, we should remove them
        print('any NaN in intensity?', np.isnan(intensity.values).sum())

        fa_scores = pd.concat([df, bias, intensity], axis=1)

        fa_scores = fa_scores.dropna(subset=bias.columns.tolist() +
                                     intensity.columns.tolist()).reset_index(
                                         drop=True)
        print('NAN scores dropped, new size:', fa_scores.shape[0])

        if format == "virtue_vice":
            df_virtue_vice = []
            for index, row in fa_scores.iterrows():
                row_virtue_vice = {}
                for mf in self.axes.keys():
                    if row[f'bias_{mf}'] < 0:
                        row_virtue_vice[f'{mf}.vice'] = row[f'intensity_{mf}']
                        row_virtue_vice[f'{mf}.virtue'] = 0
                    else:
                        row_virtue_vice[f'{mf}.virtue'] = row[
                            f'intensity_{mf}']
                        row_virtue_vice[f'{mf}.vice'] = 0
                df_virtue_vice.append(row_virtue_vice)

            df_virtue_vice = pd.DataFrame(df_virtue_vice)
            fa_scores = pd.concat([fa_scores, df_virtue_vice], axis=1)
            print('After adding vice-virtue scores, the shape:',
                  fa_scores.shape)

        if save_path:
            if len(save_path.split('/')) > 1:
                output_dir = '/'.join(save_path.split('/')[:-1])
                Path(output_dir).mkdir(parents=True, exist_ok=True)
            fa_scores.to_csv(save_path, index=None, header=True)
            print('Moral Foundations FrameAxis scores saved to {}'.format(
                save_path))
        else:
            print('not saving the fa scores.')
        return fa_scores
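The virtue_vice branch above encodes a sign convention: for each moral-foundation axis, a negative bias routes the intensity score to the vice column, otherwise to the virtue column. A standalone sketch of that mapping with made-up axis names and scores:

import pandas as pd

# hypothetical bias/intensity scores for two moral-foundation axes
row = {'bias_care': -0.2, 'intensity_care': 0.8,
       'bias_fairness': 0.3, 'intensity_fairness': 0.5}

virtue_vice = {}
for mf in ('care', 'fairness'):
    if row[f'bias_{mf}'] < 0:
        virtue_vice[f'{mf}.vice'] = row[f'intensity_{mf}']    # negative bias -> vice pole
        virtue_vice[f'{mf}.virtue'] = 0
    else:
        virtue_vice[f'{mf}.virtue'] = row[f'intensity_{mf}']  # non-negative bias -> virtue pole
        virtue_vice[f'{mf}.vice'] = 0

print(pd.DataFrame([virtue_vice]))  # care.vice=0.8, fairness.virtue=0.5, the other two are 0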
Example #7
File: utils.py Project: psm8/chatbot-ninja
def similarity_with_wrong_answer(self, user_input):
    self.you_are_wrong_answer = [
        preprocess.preprocess(x) for x in [
            "That is not what I meant", "You are wrong",
            "You are misunderstanding me", "You don't understand"
        ]
    ]
    return max([x.similarity(user_input) for x in self.you_are_wrong_answer])
Example #8
def main():
    parser = _get_parser()
    opt = parser.parse_args()
    log_info(opt)

    audio_paths, transcripts = preprocess(opt.dataset_path, opt.preprocess_mode)

    if opt.output_unit == 'character':
        generate_character_labels(transcripts, opt.vocab_dest)
        generate_character_script(audio_paths, transcripts, opt.vocab_dest)

    elif opt.output_unit == 'subword':
        train_sentencepiece(transcripts, opt.savepath, opt.vocab_size)
        sentence_to_subwords(audio_paths, transcripts, opt.savepath)

    elif opt.output_unit == 'grapheme':
        sentence_to_grapheme(audio_paths, transcripts, opt.vocab_dest)

    else:
        raise ValueError("Unsupported preprocess method : {0}".format(opt.output_unit))
Example #9
File: x.py Project: Discookie/telekom-bead
def _build(difficulty: Union[int, bool],
           thread_count: int,
           is_silent: bool,
           build_files: Optional[List[str]] = None) -> bool:
    try:
        with open("meta/build.json", "r") as f:
            build_params = json.load(f)
    except:
        print("meta/build.json does not exist", file=sys.stderr)
        return False

    if difficulty is True:
        difficulty = build_params.get("release-opt", 9)
    elif difficulty is False:
        difficulty = build_params.get("debug-opt", 6)

    files_to_build: List[Dict[str, Any]]

    if build_files is None:
        files_to_build = build_params.get("files", [])
    else:
        files_to_build = [
            file for file in build_params.get("files", [])
            if file["path"] in build_files
        ]

    file_sizes: Dict[str, int] = {}

    if not path.exists("build/"):
        mkdir("build/")

    for file in files_to_build:
        if "path" not in file:
            print("Malformed JSON", file=sys.stderr)
            return False

        # Step 0: Read file
        try:
            with open("src/" + file["path"], "r") as f:
                step_0 = f.read()
        except:
            print(f'File {file["path"]} does not exist', file=sys.stderr)
            return False

        if not is_silent:
            print("Processing file", file["path"], file=sys.stderr)

        step_0_size = len(step_0)

        # Step 1: Find chars
        step_1_vars = file.get("variables", [])
        step_1_aliases = file.get("aliases", {})
        step_1_excluded = file.get("excluded", [])

        try:
            step_1_size, step_1_list, _ = find_chars(step_0,
                                                     step_1_vars,
                                                     step_1_aliases,
                                                     step_1_excluded,
                                                     difficulty,
                                                     thread_count,
                                                     silent=is_silent)
        except Exception as e:
            print("Step 1 failed: crash", file=sys.stderr)
            print(e, file=sys.stderr)
            return False

        if len(step_1_list) == 0:
            print("Step 1 failed: no combinations", file=sys.stderr)
            return False

        step_1 = step_0
        for src, tgt in step_1_list[0]:
            step_1 = step_1.replace(src, tgt)

        if not is_silent:
            print("Step 1 complete", file=sys.stderr)

        # Step 2: Preprocess and template code
        if "template" in file:
            template = file["template"]
        else:
            template = "template.py"
        with open("preprocess/" + template, "rb") as f:
            step_2_template = f.read()

        step_2_compressed_code = preprocess(step_1, step_1_list[0])

        step_2 = step_2_template.replace(b"{{code}}", step_2_compressed_code)
        step_2_size = len(step_2)

        if not is_silent:
            print("Step 2 complete", file=sys.stderr)

        # Note that the file is overwritten
        with open("build/" + file["path"], "wb") as f:
            f.write(step_2)

        file_sizes[file["path"]] = step_2_size

        if not is_silent:
            print(f'===', file=sys.stderr)
            print(f'File {file["path"]}:', file=sys.stderr)
            print(f'   {step_0_size} => {step_1_size} characters',
                  file=sys.stderr)
            print(f'   Final size: {step_2_size}', file=sys.stderr)
            print(f'===', file=sys.stderr)

    if not is_silent:
        print("Cumulative size:", sum(file_sizes.values()), file=sys.stderr)

    return True
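The step comments above lay out the pipeline (read the source, find character combinations, substitute them, wrap the result in a template, write to build/). A hedged call sketch, with a hypothetical file name in build_files:

# build everything listed in meta/build.json at the "release" difficulty
ok = _build(difficulty=True, thread_count=4, is_silent=False)

# rebuild only one file (hypothetical name) at an explicit difficulty level
ok = _build(difficulty=7, thread_count=1, is_silent=True,
            build_files=["solution.py"])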
Example #10
# -*- coding: utf-8 -*-
from preprocess.preprocess import preprocess

preprocess()
Example #11
import os
import time

# project-local modules, inferred from the calls below
import es_model
import preprocess
import word2vect

LOAD_DATA = False
INDEX_NAME = 'qa'
PREPROCESS = False

GENERATE_VECT = True
TOPN = 20
org_input_file = '../data/kuaixue_org.csv'
input_file = '../data/kuaixue_p.csv'

stop_words_file = '../data/stopwords.txt'
my_vect_file = '../data/question.word2vec.bin'

if __name__ == "__main__":
    if PREPROCESS:
        preprocess.preprocess(org_input_file, input_file)

    if GENERATE_VECT:
        word2vect.generate_model(input_file, stop_words_file, my_vect_file)

    if LOAD_DATA:
        if os.path.exists(input_file):
            es_mode = es_model.ES_Model(input_file, INDEX_NAME,
                                        stop_words_file, True)
        else:
            print("Input file is not exist")
    else:
        #es_mode = es_model.ES_Model(input_file, INDEX_NAME, stop_words_file ,False)
        pass
Example #12
def run_preprocess(cfg_mysql, cfg_pipeline, cfg_course, cfg_mysql_script_path):
    print "###### Step 1: Pre-processing database"
    preprocess.preprocess(cfg_mysql, cfg_pipeline, cfg_course,
                          cfg_mysql_script_path)
    print "Done"
Example #13
File: utils.py Project: psm8/chatbot-ninja
def get_doc_from_input(message):
    user_input = input(message)
    return preprocess.preprocess(user_input)
Example #14
import os
import extraction.extract_micr as extract
import preprocess.preprocess as p
import numpy as np
import imutils
import skimage
import argparse
import cv2

ap = argparse.ArgumentParser()
ap.add_argument('--image', required=True, help='Absolute path of image')
args = vars(ap.parse_args())

# directory = os.getcwd()
# data_directory = directory + '/cheques/'
# print(f'Images available: {os.listdir(data_directory)}')

input_ = args['image']

image = input_
print(f'Extracting from: {image}')

preprocessed_img = p.preprocess(image_path=image)
extracted_micr, contour_img = extract.extract_micr(image=preprocessed_img)

print(f'MICR Code: {extracted_micr}')
cv2.imwrite('ocr_cheque.jpg', contour_img)
cv2.imshow('Detected MICR code', cv2.resize(contour_img, (1000, 400)))
cv2.waitKey(0)
cv2.destroyAllWindows()
print('Exiting...')
Example #15
def up_to_july(professions):
    """
    Runs normal, inter-year mobility analysis using only data for the first seven months of each year, i.e. up to July.

    :param professions: dict where key is profession name and value is base path to month-level data table
    :return: None
    """

    for prof, path in professions.items():

        # THIS IS FOR RUNNING ANALYSES USING ONLY MONTH DATA UP TO JULY

        # NB: for the sampler below to work properly (i.e. always try to sample July) you need to change the
        # value for "judges" in function preprocess.sample.get_sampling_month rom 4 to 7. Otherwise it tries to sample
        # April, for judges. No such issue for prosecutors.

        # for each year, sample by throwing out all observations occurring in months AFTER July
        with open(path, 'r') as infile:
            person_month_table = list(csv.reader(infile))[1:]
        sampled_years = [y for y in range(2006, 2021)]
        sampled_table = sample.mo_yr_sample(person_month_table, prof, [1, 2, 3, 4, 5, 6, 7], sampled_years)
        # write sampled table to disk
        samp_file_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' + 'sampled_collected/'
        with open(samp_file_dir + prof + '_to_july_sampled_month.csv', 'w') as out_file:
            writer = csv.writer(out_file)
            [writer.writerow(pm) for pm in sampled_table]

        # run preprocessor on sampled data
        prep_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' + 'preprocessed/'
        prep_file_path = prep_dir + prof + '_preprocessed.csv'
        std_log_path = prep_dir
        pids_log_path = prep_dir
        preprocess.preprocess(samp_file_dir, prep_file_path, std_log_path, pids_log_path, prof)

        # get descriptor tables using the preprocessed data
        descr_out_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' + 'descriptors/'
        with open(prep_file_path, 'r') as in_f:
            table = list(csv.reader(in_f))[1:]

        start_year, end_year = 2006, 2020

        # make table of total counts per year
        describe.year_counts_table(table, start_year, end_year, prof, descr_out_dir)

        # make tables of total counts per year, per level in judicial hierarchy
        describe.year_counts_table(table, start_year, end_year, prof, descr_out_dir, unit_type='nivel')

        # make tables for entry and exit cohorts, per year, per gender, per level in judicial hierarchy
        describe.entry_exit_gender(table, start_year, end_year, prof, descr_out_dir, entry=False, unit_type='nivel')
        describe.entry_exit_gender(table, start_year, end_year, prof, descr_out_dir, entry=True, unit_type='nivel')

        # make table for mobility between appellate court regions
        describe.inter_unit_mobility_table(table, descr_out_dir, prof, 'ca cod')

        # make table for hierarchical mobility
        describe.hierarchical_mobility_table(table, descr_out_dir, prof)

        for unit_type in ['ca cod', 'nivel']:
            # make tables for entry and exit cohorts, per year per unit type
            describe.entry_exit_unit_table(table, start_year, end_year, prof, unit_type, descr_out_dir, entry=True)
            describe.entry_exit_unit_table(table, start_year, end_year, prof, unit_type, descr_out_dir, entry=False)
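Per the docstring, professions maps a profession name to the base path of its month-level data table. A hedged call sketch with made-up paths (root is the module-level prefix the function itself uses):

# hypothetical month-level person-period tables
professions = {
    'judges': root + 'collected/judges_month.csv',
    'prosecutors': root + 'collected/prosecutors_month.csv',
}
up_to_july(professions)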
Example #16
    parser = argparse.ArgumentParser(
        description='End-to-end Speech Recognition')
    parser.add_argument('--dataset_path',
                        type=str,
                        default='SET YOUR KsponSpeech corpus PATH')
    parser.add_argument(
        '--new_path',
        type=str,
        default='SET YOUR path to store preprocessed KsponSpeech corpus')
    parser.add_argument('--labels_dest',
                        type=str,
                        default='SET YOUR path to store aihub_labels.csv file')
    parser.add_argument('--script_prefix',
                        type=str,
                        default='KsponScript_',
                        help='default: KsponScript_FILENUM.txt')
    parser.add_argument(
        '--mode',
        type=str,
        default='numeric',
        help='default: phonetic(6->"육"), optional: numeric(6->"6")')
    parser.add_argument('--filenum_adjust',
                        action='store_true',
                        default=False,
                        help='adjust file number for handling "%"')
    opt = parser.parse_args()

    preprocess(opt.dataset_path, opt.new_path, opt.mode, opt.filenum_adjust)
    create_char_labels(opt.new_path, opt.labels_dest)
    create_script(opt.dataset_path, opt.new_path, opt.script_prefix)
    #gather_files(opt.dataset_path, opt.new_path, opt.script_prefix)
Example #17
        if prof in {'judges', 'prosecutors'}:
            in_dir = root + trunks['dispersed'] + leaves[prof]['dispersed']['raw']
            scrape_log = root + trunks['dispersed'] + leaves[prof]['dispersed']['scrape log']
            scrape.update_db(in_dir, scrape_log, prof)

        # collect the data (which also does a first clean)
        in_dir = root + trunks['dispersed'] + leaves[prof]['dispersed']['raw']
        out_path = root + trunks['collected'] + leaves[prof]['collected']['file']
        make_table.make_pp_table(in_dir, out_path, prof)

        # preprocess the data (add variables, standardise names, assign unique IDs, etc.)
        in_dir = root + trunks['collected'] + leaves[prof]['collected']['dir']
        pop_out_path = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['population']
        std_log_path = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['standardise']
        pids_log_path = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['pids']
        preprocess.preprocess(in_dir, pop_out_path, std_log_path, pids_log_path, prof)

        # describe the data, i.e. generate tables of descriptive statistics for different samples
        pop_in_file = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['population']
        for sample in deets['samples']:
            # make directory tree for dumping the descriptives tables; NB: overwrites existing tree structure
            sample_out_dirs = {'totals': '', 'entry_exit': '', 'mobility': '', 'inheritance': ''}
            for d in sample_out_dirs:
                path_end = sample + '/' + d + '/'
                sample_out_dirs.update({d: root + trunks['descriptives'] + leaves[prof]['descriptives'] + path_end})
            [Path(d).mkdir(parents=True, exist_ok=True) for d in sample_out_dirs.values()]
            # generate the descriptives tables
            describe.describe(pop_in_file, sample, sample_out_dirs['totals'], sample_out_dirs['entry_exit'],
                              sample_out_dirs['mobility'], sample_out_dirs['inheritance'], prof,
                              deets['range'][0], deets['range'][1], deets['units'])
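The loop body above indexes several nested path and config dictionaries (root, trunks, leaves, deets) that are defined outside the snippet. A hedged illustration of the shapes it expects, with entirely made-up values:

# hypothetical layout of the path/config dictionaries this loop body indexes
root = '/home/user/project/'
trunks = {'dispersed': 'data/dispersed/', 'collected': 'data/collected/',
          'preprocessed': 'data/preprocessed/', 'descriptives': 'data/descriptives/'}
leaves = {'judges': {
    'dispersed': {'raw': 'judges/raw/', 'scrape log': 'judges/scrape_log.csv'},
    'collected': {'file': 'judges/judges.csv', 'dir': 'judges/'},
    'preprocessed': {'population': 'judges/population.csv',
                     'standardise': 'judges/standardise_log.csv',
                     'pids': 'judges/pids_log.csv'},
    'descriptives': 'judges/'}}
deets = {'samples': ['population'], 'range': (2006, 2020), 'units': ['ca cod', 'nivel']}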
Example #18
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  7 14:24:07 2017
Version 0.1 Finalized
@author: Bahram
"""
# this code reads a CSV file of tweets and labels to feed an NB classifier and compute the classifier's parameters
import pickle
import csv
import tokenizer
from trainer import Trainer
from operator import itemgetter
from classifier import Classifier
from preprocess.preprocess import preprocess  # tokenize, stem, remove stop words and detect negation
tweetTrainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=['?!#%&']))
process = preprocess()

def processing(training):
    # preprocess sentences: stemming, stop word removal and negation detection
    trainingProcessed = list(map(process.gettokens, training))
    trainingProcessed = list(map(process.stemtokens, trainingProcessed))
    trainingProcessed = list(map(process.removestopwords, trainingProcessed))
    #print (trainingProcessed)
    trainingProcessed = list(map(process.negatesequence, trainingProcessed))
    trainingProcessedWords = []
    for sentence in trainingProcessed:
        sentence = ' '.join(sentence)
        wordsProcessed = sentence.split()
        for wordProcessed in wordsProcessed:
            trainingProcessedWords.append(wordProcessed)
    return trainingProcessedWords
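The comments describe a four-stage chain (tokenize, stem, remove stop words, detect negation) whose output is flattened into a single word list. A quick sketch of calling processing on two made-up sentences:

# two made-up training sentences; the result is one flat list of processed tokens
sample_training = ["I do not like this phone", "This phone is great"]
print(processing(sample_training))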
Example #19
"""
@github{
  title = {KsponSpeech.preprocess},
  author = {Soohwan Kim},
  publisher = {GitHub},
  url = {https://github.com/sooftware/KsponSpeech.preprocess},
  year = {2020}
}
"""
import argparse
from preprocess.preprocess import preprocess, create_char_labels, create_script


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='End-to-end Speech Recognition')
    parser.add_argument('--dataset_path', type=str, default='SET YOUR KsponSpeech corpus PATH')
    parser.add_argument('--script_prefix', type=str, default='KsponScript_', help='default: KsponScript_FILENUM.txt')
    opt = parser.parse_args()

    preprocess(opt.dataset_path)
    create_char_labels(opt.dataset_path)
    create_script(opt.dataset_path, opt.script_prefix)