Example #1
def save_vectors():
    if not MODEL_FILEPATH.exists():
        raise TrainedModelNotFoundError()

    write_lines([f"{len(model.words)} {len(model[model.words[0]])}"] + \
                [w + ' ' + ' '.join(map(str, model[w])) for w in model.words],
                MODEL_FILEPATH.with_suffix('.vec'))
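The `write_lines` helper itself is not shown on this page. Judging from the call above, this project passes the list of lines first and the destination path second; a minimal sketch under that assumption:

def write_lines(lines, filepath):
    # Assumed behavior: write each string in `lines` to `filepath`,
    # one string per line.
    with open(filepath, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

The snippet itself emits the standard word2vec text format: a header line with the vocabulary size and the vector dimensionality, followed by one `word v1 v2 ... vn` line per word.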
Example #2
def generate_for_each(macro_prefix, header_guard_prefix, supported_size,
                      nested_loops_count, output_dir):
    '''
    Generate FOR_EACH macros, one per required nesting level. Each FOR_EACH
    supports a __VA_ARGS__ length up to the specified `supported_size`.
    '''
    lines = utils.generate_header_begin(macro_prefix, header_guard_prefix,
                                        'for_each')
    lines.append('')

    # Generate #includes for the ARG, CAT and INC helper macros
    lines.append('#include "{}arg.hpp"'.format(macro_prefix.lower()))
    lines.append('#include "{}cat.hpp"'.format(macro_prefix.lower()))
    lines.append('#include "{}inc.hpp"'.format(macro_prefix.lower()))
    lines.append('')

    # Generate multiple for each macros for nested loops
    for x in range(nested_loops_count):
        # Generate macro which invokes the correct iteration
        lines += generate_for_each_dispatch_macro(macro_prefix, x)
        lines.append('')
        # Generate the ITERN macros.
        for i in range(supported_size, 1, -1):
            lines += generate_for_each_itern_macro(macro_prefix, x, i)
        # Generate the ITER1 macro.
        lines += generate_for_each_iter1_macro(macro_prefix, x)

        lines.append('')

    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'for_each', output_dir),
        lines)
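Note that the argument order is the opposite of Example #1: here `utils.write_lines` takes the output file name first and the list of generated source lines second, so the two projects ship different helpers under the same name. A purely illustrative invocation of the generator (the prefixes, sizes and output directory below are hypothetical):

generate_for_each(macro_prefix='pp_', header_guard_prefix='PP',
                  supported_size=64, nested_loops_count=3,
                  output_dir='generated/include')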
Example #3
def _write_ft_file(filename,
                   text_key=TEXT_KEY,
                   label_key=LABEL_KEY,
                   context_key=CONTEXT_KEY,
                   remove_empty=True,
                   clean=True,
                   relevant_labels=None,
                   label_prefix=LABEL_PREFIX,
                   context_prefix=CONTEXT_PREFIX):
    data = _load_data(filename, text_key=text_key, remove_empty=remove_empty)

    if clean:
        data = _clean_texts(data, text_key=text_key)
        data = _clean_labels(data,
                             label_key=label_key,
                             relevant_labels=relevant_labels)
        data = _clean_contexts(data,
                               context_key=context_key,
                               context_prefix=context_prefix)

    def add_prefix(label):
        return f"{label_prefix}{label if len(label) > 0 else 'unknown'}"

    def make_row(label, text, context):
        row = [add_prefix(label)] + ([context] if context else []) + [text]
        return ' '.join(row).strip()

    TRAIN_FT_FILEPATH.parent.mkdir(parents=True, exist_ok=True)

    texts = [
        make_row(row[label_key],
                 row[context_key] if context_key in row.keys() else '',
                 row[text_key]) for row in data
    ]
    write_lines(texts, TRAIN_FT_FILEPATH)
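For reference, `make_row` builds lines in the shape that fastText's supervised mode expects: the prefixed label, an optional context token, then the text. A small illustration, assuming the common default `LABEL_PREFIX = '__label__'`:

LABEL_PREFIX = '__label__'  # assumed value of the module-level constant

def add_prefix(label):
    return f"{LABEL_PREFIX}{label if len(label) > 0 else 'unknown'}"

def make_row(label, text, context):
    row = [add_prefix(label)] + ([context] if context else []) + [text]
    return ' '.join(row).strip()

print(make_row('positive', 'great movie', 'reviews'))
# -> __label__positive reviews great movie
print(make_row('', 'no label here', ''))
# -> __label__unknown no label here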
Example #4
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "segmented"))
    filenames = [n for n in filenames if n.endswith(".txt")]
    filenames.sort()

    utils.mkdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))

    for filename in pyprind.prog_bar(filenames):
        path_seg = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "segmented", filename)
        path_raw = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "raw", filename)
        path_dst = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed",
                                filename.replace(".txt", ".edus"))
        # Input
        edus = utils.read_lines(path_seg, process=lambda line: line)
        edus = remove_empty_lines(filename, edus)
        raw_lines = utils.read_lines(path_raw, process=lambda line: line)
        raw_lines = remove_empty_lines(filename, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)
        # Processing
        edus = convert_edus(edus, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)
        # Output
        utils.write_lines(path_dst, edus)
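The two `count_chars` assertions check that segmentation only re-flows line breaks and never adds or drops characters. `count_chars` is not shown here; a minimal sketch, assuming it counts non-whitespace characters so that differences in spacing and line breaks do not matter:

def count_chars(lines):
    # Assumed helper: total number of non-whitespace characters.
    return sum(len(token) for line in lines for token in line.split())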
Example #5
def srt2ss(path, is_ass=True):

    timer = utils.Timer()
    timer.start()

    subs = utils.load_sub_file(path)

    start_time = utils.get_start_time(subs, 'ass')
    end_time = utils.get_end_time(subs, 'ass')
    plaintext = utils.get_plaintext(subs)

    sub_block = []
    LAYER = 0
    STYLE = 'Default'
    NAME = ''
    MARGINL = 0
    MARGINV = 0
    EFFECT = ''
    for i in range(len(subs)):
        sub_block.append('Dialogue: %d, %s, %s, %s, %s, %d, %d, %s, %s' %
                         (LAYER, start_time[i], end_time[i], STYLE, NAME,
                          MARGINL, MARGINV, EFFECT, plaintext[i]))
    utils.write_txt('%s.ass' % (output_filename), script_info())
    utils.write_lines('%s.ass' % (output_filename), sub_block, mode='a')

    timer.stop()

    print('转换完成,用时%.2f秒' % (timer.elapsed))
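Each entry appended to `sub_block` is one ASS `Dialogue:` event (note that `output_filename` is defined elsewhere in the original script, outside this excerpt). An illustrative rendering of one event with made-up values:

example = 'Dialogue: %d, %s, %s, %s, %s, %d, %d, %s, %s' % (
    0, '0:00:01.00', '0:00:03.50', 'Default', '', 0, 0, '', 'Hello world')
print(example)
# -> Dialogue: 0, 0:00:01.00, 0:00:03.50, Default, , 0, 0, , Hello world

The standard ASS `Format:` line also declares a MarginR field between MarginL and MarginV, which this snippet omits; most renderers are tolerant, but it is worth knowing when reusing the output.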
Example #6
def pre_process_data(raw_data, tokenizer, config, logger):
    '''
    raw_data: dir or a specific file
    '''
    vocab_file = os.path.join(config.tokenized_data_dir, 'vocab.txt')
    sample_file = os.path.join(config.tokenized_data_dir, 'samples.txt')
    if os.path.isfile(vocab_file) and os.path.isfile(sample_file):
        logger.info("vocab file and sample file already exist!")
        return Data(vocab_file, sample_file, config, logger)
    else:
        logger.info("Generate vocabulary and tokenized samples.")
        if os.path.isfile(raw_data):
            raw_data = [raw_data]
        else:
            raw_data = glob.glob(os.path.join(raw_data, '*'))
        samples = set()
        for file in raw_data:
            for qa in parse_raw_file(file):
                q = qa[0]
                a = qa[1]
                tokenized_q = tokenize_one_line(
                    sentence=q,
                    cut_fun=tokenizer.tokenize,
                    specical_symbol=config.special_symbol,
                    mode=config.source_language_type,
                    lower=config.source_language_lower)
                tokenized_a = tokenize_one_line(
                    sentence=a,
                    cut_fun=tokenizer.tokenize,
                    specical_symbol=config.special_symbol,
                    mode=config.target_language_type,
                    lower=config.target_language_lower)
                samples.add(tokenized_q + "\t" + tokenized_a)
        logger.info('sample size:{}'.format(len(samples)))
        logger.info("save samples in '{}'".format(sample_file))
        write_lines(sample_file, samples)
        source_vocab, target_vocab, special_vocab = create_vocabulary(
            samples, config.special_symbol)
        source_vocab = set(list(source_vocab.keys()))
        for s_symbol in config.vocab_remains:
            if s_symbol in source_vocab:
                source_vocab.discard(s_symbol)
            if s_symbol in target_vocab:
                target_vocab.discard(s_symbol)
            if s_symbol in special_vocab:
                special_vocab.discard(s_symbol)
        logger.info('vocab size:{}'.format(
            len(source_vocab) + len(target_vocab) + len(special_vocab) +
            len(config.vocab_remains)))
        logger.info('save vocabulary in "{}"'.format(vocab_file))
        with open(vocab_file, 'w', encoding='utf8') as f:
            for line in config.vocab_remains:
                f.write(line + '\n')
            for line in special_vocab:
                f.write(line + '\n')
            for line in source_vocab | target_vocab:
                f.write(line + '\n')
        return Data(vocab_file, sample_file, config, logger)
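`parse_raw_file` and `tokenize_one_line` are defined elsewhere in that project. As an illustration only, a minimal stand-in for `parse_raw_file` could yield question/answer pairs like this, assuming one tab-separated pair per line (the real corpus format may differ):

def parse_raw_file(path):
    # Hypothetical reader: yield (question, answer) tuples, one per
    # non-empty line, with the two fields separated by a tab.
    with open(path, encoding='utf8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) >= 2:
                yield parts[0], parts[1]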
Example #7
def hit_keyword(fn):
    keywords = []
    f = open(fn)
    while True:
        line = f.readline()
        if not line:
            break
        keywords.append(line.strip("\n"))
        line = f.readline()
        line = f.readline()
    from utils import write_lines
    write_lines("./data/query_hit_keyword.dat", keywords)
    f.close()
Example #8
def hit_keyword(fn):
    keywords = []
    f = open(fn)
    while True:
        line = f.readline()
        if not line:
            break
        keywords.append(line.strip("\n"))
        line = f.readline()
        line = f.readline()
    from utils import write_lines
    write_lines("./data/query_hit_keyword.dat",keywords)
    f.close()
Example #9
def generate_cat(macro_prefix, header_prefix, supported_size, output_dir):
    '''
    Generate the CAT macro, which concatenates its two arguments.
    '''
    lines = utils.generate_header_begin(macro_prefix, header_prefix, 'cat')
    lines.append('')

    lines += utils.get_cat_lines(macro_prefix, '', supported_size)

    lines.append('')
    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'cat', output_dir), lines)
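`utils.get_cat_lines` is not part of this excerpt. The lines it emits presumably follow the classic two-step token-pasting pattern, where an outer macro forces its arguments to expand before `##` is applied; a sketch of that assumption:

def get_cat_lines(macro_prefix, name_infix, supported_size):
    # Assumed shape of the emitted macros; the real helper may additionally
    # generate size-dependent variants up to `supported_size`.
    name = '{}{}CAT'.format(macro_prefix.upper(), name_infix)
    return [
        '#define {0}(a, b) {0}_IMPL(a, b)'.format(name),
        '#define {0}_IMPL(a, b) a ## b'.format(name),
    ]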
Example #10
def main():
    config = utils.Config()

    path_out = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt")
    utils.mkdir(path_out)
    utils.mkdir(os.path.join(path_out, "raw"))

    sections = os.listdir(config.getpath("ptbwsj"))
    sections.sort()
    rstdt_wsj_filenames = get_rstdt_wsj_filenames()
    count = 0
    for sec_i, section in enumerate(sections):
        print("[%d/%d] Processing %s" % \
                (sec_i+1, len(sections),
                 os.path.join(config.getpath("ptbwsj"), section)))

        filenames = os.listdir(os.path.join(config.getpath("ptbwsj"), section))
        filenames = [n for n in filenames if n.startswith("wsj_")]
        filenames.sort()
        for filename in filenames:
            if filename in rstdt_wsj_filenames:
                print("Skipped %s (which is contained in RST-DT)" % filename)
                continue
            count += 1

            try:
                lines = utils.read_lines(os.path.join(config.getpath("ptbwsj"),
                                                      section, filename),
                                         process=lambda line: line)
            except UnicodeDecodeError:
                lines = []
                for line in codecs.open(
                        os.path.join(config.getpath("ptbwsj"), section,
                                     filename), "r", "latin-1"):
                    line = line.strip()
                    lines.append(line)
            assert lines[0] == ".START"
            lines = lines[1:]
            top_empty_count = 0
            for line_i in range(len(lines)):
                if lines[line_i] == "":
                    top_empty_count += 1
                else:
                    break
            lines = lines[top_empty_count:]
            utils.write_lines(
                os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "raw",
                             filename + ".txt"), lines)

    print("Processed %d files." % count)
Example #11
def generate_arg(macro_prefix, header_guard_prefix, supported_size,
                 output_dir):
    '''
    Generate the ARG_LENGTH macro.
    '''
    # Seed lines with header guard.
    lines = utils.generate_header_begin(macro_prefix, header_guard_prefix,
                                        'arg')
    lines.append('')

    # Generate ARG_NTH, where N = `supported_size` + 1, giving back the Nth
    # argument in the variable arguments.
    args_list = ['_{}'.format(i) for i in range(1, supported_size + 1)]
    args_str = ', '.join(args_list)
    arg_nth = ARG_NTH_TEMPLATE.format(macro_prefix.upper(), supported_size + 1,
                                      args_str)
    lines.append(arg_nth)
    lines.append('')

    # Generate ARG_LENGTH for getting the length of the variable arguments.
    # Works for argument lists of length 1 to `supported_size`.
    lengths_list = ['{}'.format(i) for i in range(supported_size, 0, -1)]
    lengths_str = ', '.join(lengths_list)
    arg_length = ARG_LENGTH_TEMPLATE.format(macro_prefix.upper(),
                                            supported_size + 1, lengths_str)
    lines.append(arg_length)
    lines.append('')

    # Generate macro for getting the second of the variable arguments.
    lines.append(ARG_2ND_TEMPLATE.format(macro_prefix))
    lines.append('')

    # Generate CAT for use in ARG macros.
    lines += utils.get_cat_lines(macro_prefix, 'ARG_', 2)
    lines.append('')

    # Generate ARG_IS_SINGLE for checking whether the variable arguments have
    # length 1.
    lines.append(ARG_IS_SINGLE_TEMPLATE.format(macro_prefix.upper()))
    lines.append('')

    # Generate ARG_IS_SINGULAR for checking whether the de-parenthesized
    # argument is a list of size 1.
    lines.append(ARG_IS_SINGULAR_TEMPLATE.format(macro_prefix.upper()))

    lines.append('')
    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'arg', output_dir), lines)
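The comments above describe the classic `__VA_ARGS__`-counting trick: ARG_NTH simply returns its (`supported_size` + 1)-th argument, and ARG_LENGTH pads the caller's arguments with a descending list of numbers so that the count lands in exactly that slot. The template strings are not shown in this excerpt; assuming a plausible shape for them, the generated pair for `supported_size = 4` looks like this:

macro_prefix = 'pp'
supported_size = 4
# Assumed shapes of ARG_NTH_TEMPLATE / ARG_LENGTH_TEMPLATE; the real
# templates are not part of this excerpt.
arg_nth = '#define {0}_ARG_{1}TH({2}, N, ...) N'.format(
    macro_prefix.upper(), supported_size + 1,
    ', '.join('_{}'.format(i) for i in range(1, supported_size + 1)))
arg_length = '#define {0}_ARG_LENGTH(...) {0}_ARG_{1}TH(__VA_ARGS__, {2})'.format(
    macro_prefix.upper(), supported_size + 1,
    ', '.join(str(i) for i in range(supported_size, 0, -1)))
print(arg_nth)     # #define PP_ARG_5TH(_1, _2, _3, _4, N, ...) N
print(arg_length)  # #define PP_ARG_LENGTH(...) PP_ARG_5TH(__VA_ARGS__, 4, 3, 2, 1)
# PP_ARG_LENGTH(a, b, c) then expands to PP_ARG_5TH(a, b, c, 4, 3, 2, 1),
# whose fifth argument is 3 -- the argument count.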
Example #12
def ss2srt(path, chinese_only=False, english_only=False, Tchinese_only=False):

    timer = utils.Timer()
    timer.start()

    subs = utils.load_sub_file(path)

    start_time = utils.get_start_time(subs, 'srt')
    end_time = utils.get_end_time(subs, 'srt')
    plaintext = utils.get_plaintext(subs)
    format_sub = []

    if chinese_only:
        pass
    elif english_only:
        pass
    elif Tchinese_only:
        pass
    else:
        # Simplified Chinese & English bilingual output
        for i in range(len(subs)):
            format_sub.append('%s\n' % (i + 1))
            if i > 0 and start_time[i] == start_time[i - 1] \
                    and end_time[i] == end_time[i - 1]:
                format_sub.append('%s --> %s\n' % (start_time[i], end_time[i]))
                format_sub.append('%s' % (plaintext[i - 1]))
                format_sub.append('%s\n' % (plaintext[i]))
            elif i + 1 < len(subs) and start_time[i] == start_time[i + 1] \
                    and end_time[i] == end_time[i + 1]:
                # This cue shares its timing with the next one; drop the index
                # line just appended and emit both texts on the next iteration.
                format_sub.pop()
            else:
                format_sub.append('%s --> %s\n' % (start_time[i], end_time[i]))
                format_sub.append('%s\n' % (plaintext[i]))

    utils.write_lines('%s.srt' % ("test"), format_sub)

    timer.stop()

    print('转换完成,用时%.2f秒' % (timer.elapsed))
Example #13
def generate_inc(macro_prefix, header_guard_prefix, supported_size,
                 output_dir):
    '''
    Generate INC macros; invoking INC_N gives back N + 1.
    '''
    lines = utils.generate_header_begin(macro_prefix, header_guard_prefix,
                                        'inc')
    lines.append('')
    lines.append('// INC_N gives back N+1')

    # Generate INC_N macros, which gives back N + 1.
    inc_macros = [
        INC_TEMPLATE.format(macro_prefix.upper(), i, i + 1)
        for i in range(supported_size)
    ]
    lines += inc_macros

    lines.append('')
    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'inc', output_dir), lines)
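`INC_TEMPLATE` is also defined outside this excerpt. Assuming it has the shape `'#define {0}INC_{1} {2}'`, the comprehension above would emit lines like the following for `supported_size = 3` and the (hypothetical) prefix `'PP_'`:

INC_TEMPLATE = '#define {0}INC_{1} {2}'  # assumed shape of the real template

inc_macros = [INC_TEMPLATE.format('PP_', i, i + 1) for i in range(3)]
print('\n'.join(inc_macros))
# -> #define PP_INC_0 1
#    #define PP_INC_1 2
#    #define PP_INC_2 3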
Example #14
def extract_plain_text(path, english_only=False, chinese_only=False):

    timer = utils.Timer()
    timer.start()

    subs = utils.load_sub_file(path)
    plaintext = utils.get_plaintext(subs)

    if english_only and chinese_only:
        print(
            '仅保留中文和仅保留英文不能同时勾选\nChinese only and English only cannot be checked at the same time'
        )
        sys.exit(0)

    elif chinese_only:
        chinese_lines = []
        for i in range(len(plaintext)):
            chinese_lines.append(utils.chinese_only(plaintext[i]) + '\n')
        utils.write_lines('%s.txt' % (output_file_name), chinese_lines)

    elif english_only:
        english_lines = []
        for i in range(len(plaintext)):
            english_lines.append(utils.english_only(plaintext[i]) + '\n')
        utils.write_lines('%s.txt' % (output_file_name), english_lines)

    else:
        utils.write_lines('%s.txt' % (output_file_name), plaintext)

    timer.stop()

    print('提取完成,用时%.2f秒' % (timer.elapsed))
Example #15
def main(*args):
    assert len(args) >= 2

    word_embeddings = np.load("embedding/word_embeddings.npy")
    position_embeddings_1 = np.load("embedding/position_embeddings_1.npy")
    position_embeddings_2 = np.load("embedding/position_embeddings_2.npy")
    embeddings = make_dict(word_embeddings, position_embeddings_1, position_embeddings_2)

    from models import build_model
    model = build_model(embeddings)
    weights_path = args[0]
    model.load_weights(weights_path)

    dis2idx_1 = json_load("embedding/dis2idx_1.json")
    dis2idx_2 = json_load("embedding/dis2idx_2.json")
    word2idx = json_load("embedding/word2idx.json")
    encoder = Encoder(word2idx, dis2idx_1, dis2idx_2)

    input_file = args[1]
    sentences, y = read_input(input_file)
    data = list(map(list, zip(*[s.generate_features(encoder) for s in sentences])))

    scores = model.predict(data, verbose=False)
    predictions = scores.argmax(-1)
    idx2relation = read_relations("origin_data/relations.txt")
    outputs = ["{} {}".format(prediction, idx2relation[prediction]) for prediction in predictions]

    print("\n".join(outputs))

    timestamp = int(datetime.now().timestamp())
    output_folder = "output/test/%d" % timestamp
    os.makedirs(output_folder, exist_ok=True)
    print("output folder: %s" % output_folder)
    output_file = os.path.join(output_folder, 'output.txt')
    error_list_file = os.path.join(output_folder, 'error_list.txt')
    error_predictions_file = os.path.join(output_folder, 'error_predictions.txt')

    write_lines(output_file, outputs)

    error_list = []
    error_predictions = []
    for sentence, label, prediction in zip(sentences, y, predictions):
        if label != prediction:
            error_list.append('{} {}'.format(label, str(sentence)))
            error_predictions.append('{} {}'.format(prediction, idx2relation[prediction]))

    write_lines(error_list_file, error_list)
    write_lines(error_predictions_file, error_predictions)
Example #16
    def random_login(self):
        tsp = utils.random_timestamp()
        outlier = random.random() < OUTLIERS_RATE
        row = (self.uid, tsp, self.random_country(tsp, outlier))
        return outlier, row

    def random_country(self, tsp, outlier):
        pass

class Sedentary(User):
    def random_country(self, _, outlier):
        return self.countries[outlier]

class BusinessTraveler(User):
    def random_country(self, tsp, outlier):
        return self.countries[outlier ^ utils.isweekend(tsp)]

class FrequentFlyer(User):
    def __init__(self, userid):
        super().__init__(userid)
        self.has_outliers = False

    def random_country(self, tsp, outlier):
        return random.choice(self.countries)

OUTLIERS_RATE = 0.05

for uid, init in enumerate((Sedentary, BusinessTraveler, FrequentFlyer)):
    user = init(uid)
    utils.write_lines("logins{}".format(user.uid), 500, user.random_login, user.has_outliers)
Example #17
    def random_country(self, tsp, outlier):
        pass


class Sedentary(User):
    def random_country(self, _, outlier):
        return self.countries[outlier]


class BusinessTraveler(User):
    def random_country(self, tsp, outlier):
        return self.countries[outlier ^ utils.isweekend(tsp)]


class FrequentFlyer(User):
    def __init__(self, userid):
        super().__init__(userid)
        self.has_outliers = False

    def random_country(self, tsp, outlier):
        return random.choice(self.countries)


OUTLIERS_RATE = 0.05

for uid, init in enumerate((Sedentary, BusinessTraveler, FrequentFlyer)):
    user = init(uid)
    utils.write_lines("logins{}".format(user.uid), 500, user.random_login,
                      user.has_outliers)
Example #18
if is_(opts.file):
    queries.extend(get_words(opts.file, i=opts.start, j=opts.stop))
if is_(opts.query):
    queries.extend(opts.query)

for i, q in enumerate(queries):
    LOGGER.info('+++ QUERY #%s: %s +++\n' % (i, q))

    RESULT_DIR = osp.join(RESULT_PREFIX, q)
    mkdir_p(RESULT_DIR)

    if opts.load_urls:
        urls = read_lines(osp.join(opts.load_urls, q, 'urls.txt'))
    elif opts.sch:
        urls = image_search(q, opts.target)
        write_lines(urls, osp.join(RESULT_DIR, 'urls.txt'))

    if opts.load_preds:
        preds = read_lines(osp.join(opts.load_preds, q, 'preds.txt'))
    elif opts.rsch:
        preds = reverse_search_urls(q,
                                    *urls,
                                    lang=opts.target,
                                    n_img=opts.n_img)
        write_lines(preds, osp.join(RESULT_DIR, 'preds.txt'))

    # TODO
    # if opts.pred:
    #     for top_n in 1, 3, 5, 10, 20, 25:
    #         for use_lang in True, False:
    #             pred_filtered = filter_results(preds, q, lang=opts.lang)
Example #19
def main(args):
    assert args.inside_rstdt ^ args.outside_rstdt

    config = utils.Config()

    nlp = spacy.load("en_core_web_sm",
                     disable=["tagger", "parser", "ner", "textcat"])

    # Collect file names in RST-DT
    rstdt_train_filenames = os.listdir(
        os.path.join(config.getpath("data"), "rstdt", "wsj", "train"))
    rstdt_test_filenames = os.listdir(
        os.path.join(config.getpath("data"), "rstdt", "wsj", "test"))
    rstdt_train_filenames = [
        n for n in rstdt_train_filenames if n.endswith(".edus.tokens")
    ]
    rstdt_test_filenames = [
        n for n in rstdt_test_filenames if n.endswith(".edus.tokens")
    ]
    rstdt_train_filenames = [
        n[:-len(".edus.tokens")] for n in rstdt_train_filenames
    ]
    rstdt_test_filenames = [
        n[:-len(".edus.tokens")] for n in rstdt_test_filenames
    ]
    assert len(rstdt_train_filenames) == 347
    assert len(rstdt_test_filenames) == 38

    if args.outside_rstdt:
        # Prepare the target directory: /path/to/data/ptbwsj_wo_rstdt
        utils.mkdir(os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt"))

    sections = os.listdir(config.getpath("ptbwsj"))
    sections.sort()
    count = 0
    for section in pyprind.prog_bar(sections):
        # File names of articles in PTB-WSJ
        filenames = os.listdir(os.path.join(config.getpath("ptbwsj"), section))
        filenames = [n for n in filenames if n.startswith("wsj_")]
        filenames.sort()

        for filename in filenames:
            # Read text of the article
            try:
                lines = utils.read_lines(os.path.join(config.getpath("ptbwsj"),
                                                      section, filename),
                                         process=lambda line: line)
            except UnicodeDecodeError:
                lines = []
                for line in codecs.open(
                        os.path.join(config.getpath("ptbwsj"), section,
                                     filename), "r", "latin-1"):
                    line = line.strip()
                    lines.append(line)

            # Remove the ".START" markers
            assert lines[0] == ".START"
            lines = lines[1:]
            for i in range(len(lines)):
                lines[i] = lines[i].replace(".START", "")
                lines[i] = " ".join(lines[i].split())

            # Remove the beginning empty lines
            top_empty_count = 0
            for line_i in range(len(lines)):
                if lines[line_i] == "":
                    top_empty_count += 1
                else:
                    break
            lines = lines[top_empty_count:]

            # Tokenization
            tokenized_lines = []
            for line in lines:
                if line == "":
                    tokens = ""
                else:
                    doc = nlp(line)
                    tokens = [token.text for token in doc]
                    tokens = " ".join(tokens)
                tokenized_lines.append(tokens)

            if args.inside_rstdt:
                if filename in rstdt_train_filenames:
                    # File inside RST-DT training set
                    utils.write_lines(
                        os.path.join(config.getpath("data"), "rstdt", "wsj",
                                     "train", filename + ".doc.tokens"),
                        tokenized_lines)
                    count += 1
                elif filename in rstdt_test_filenames:
                    # File inside RST-DT test set
                    utils.write_lines(
                        os.path.join(config.getpath("data"), "rstdt", "wsj",
                                     "test", filename + ".doc.tokens"),
                        tokenized_lines)
                    count += 1
                else:
                    continue
            else:
                if filename in rstdt_train_filenames:
                    continue
                elif filename in rstdt_test_filenames:
                    continue
                else:
                    # File outside RST-DT
                    utils.write_lines(
                        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                     filename + ".doc.tokens"),
                        tokenized_lines)
                    count += 1

    print("Processed %d files." % count)