Example #1
def output_radio_data_by_day_multi_month(
        data_file_1='/Users/ngillani/data/radio/2018_08_single_callsign_show_data.json',
        data_file_2='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json',
        start_date=datetime.date(2018, 8, 15),
        end_date=datetime.date(2018, 9, 15),
        output_file='data/radio_data_by_day_mid_aug_mid_sept.csv'):

    radio_data = [read_dict(data_file_1), read_dict(data_file_2)]
    cities_to_ids = {}

    all_data = {'id': [], 'text': []}
    ref_date = datetime.date(2018, 8, 15)

    all_days = set()
    for j in range(0, len(radio_data)):
        for i, s in enumerate(radio_data[j]):
            print(j, i)
            # if i == 100: break
            curr_date = datetime.datetime.utcfromtimestamp(
                s['segment_start_global']).date()

            if curr_date < start_date or curr_date > end_date: continue

            day_id = (curr_date - ref_date).days
            all_days.add(day_id)
            all_data['id'].append(day_id)
            all_data['text'].append(s['denorm_content'])

    print(all_days)

    df = pd.DataFrame(data=all_data)
    df.to_csv(output_file, index=False)
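The radio examples here and below (Examples #4, #5, and #10) iterate over the value returned by read_dict as a list of segment dicts with keys such as 'segment_start_global' and 'denorm_content'. A minimal sketch of such a helper, assuming the data file is a plain JSON dump (the project's actual read_dict may differ):

import json

def read_dict(path):
    # Assumed helper: load a JSON dump of radio segments into Python objects
    # (a list of dicts). A sketch, not the project's actual implementation.
    with open(path, 'r') as f:
        return json.load(f)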
Example #2
def get_image_files(image_dir, check=False):
    t = time.time()
    chinese_dict = read_dict(FLAGS.dict_text)
    words = list(chinese_dict.keys())
    count = 0
    image_tupe = []
    for f in os.listdir(image_dir):
        try:
            if not f.endswith(('.gif', '.jpg', '.png')):
                continue
            fp = os.path.join(image_dir, f)
            if not os.path.isabs(fp):
                fp = os.path.abspath(fp)
            if not os.path.exists(fp):
                continue
            if check:
                Image.open(fp)
                #cv2.imread(fp)
            label = f.split('_')[1]
            if is_valid_char(label, words):
                os.remove(fp)
                continue
            if len(label) == 0:
                os.remove(fp)
                continue
            image_tupe.append((fp, label))
            count += 1
        except Exception as e:
            print("fn: %s, error: %s" % (fp, e))
            os.remove(fp)
    te = time.time() - t
    print("cost time:%f, count:%d" % (te, len(image_tupe)))
    return image_tupe
Example #3
def train_translation_matrix(source_file, target_file, dict_file, out_file):
    """Trains a translation matrix between the source and target languages, using the words in dict_file as anchor
    points and writing the translation matrix to out_file

    Note that the source language file and target language file must be in the word2vec C ASCII format

    :param source_file: The name of the source language file
    :param target_file: The name of the target language file
    :param dict_file: The name of the file with the bilingual dictionary
    :param out_file: The name of the file to write the translation matrix to
    """
    log.info("Reading the training data")
    train_data = read_dict(dict_file)

    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    log.info("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    log.info("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    log.debug('Words in the source space: %s' % source_sp.row2id)
    log.debug('Words in the target space: %s' % target_sp.row2id)

    log.info("Learning the translation matrix")
    log.info("Training data: %s" % str(train_data))
    tm = train_tm(source_sp, target_sp, train_data)

    log.info("Printing the translation matrix")
    np.savetxt(out_file, tm)
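In the translation-matrix examples (#3, #8, #12, #16, and others) the result of read_dict is unpacked with zip(*train_data), so the helper is assumed to return (source_word, target_word) pairs. A hedged sketch for a whitespace-separated bilingual dictionary file (the actual reader may differ):

def read_dict(dict_file):
    # Assumed format: one "source_word target_word" pair per line.
    pairs = []
    with open(dict_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                pairs.append((parts[0], parts[1]))
    return pairs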
Example #4
def output_radio_data_by_geo(
        data_file='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json',
        output_file='data/radio_data_by_city_and_state.csv',
        output_mapping_file='data/radio_by_city_and_state_mapping.json'):

    radio_data = read_dict(data_file)
    cities_to_ids = {}

    all_data = {'id': [], 'text': []}

    for i, s in enumerate(radio_data):
        print(i)
        geo = s['city'] + ', ' + s['state']
        if not geo in cities_to_ids:
            cities_to_ids[geo] = len(cities_to_ids)

        all_data['id'].append(cities_to_ids[geo])
        all_data['text'].append(s['denorm_content'])

    df = pd.DataFrame(data=all_data)
    df.to_csv(output_file, index=False)

    f = open(output_mapping_file, 'w')
    f.write(json.dumps(cities_to_ids, indent=4))
    f.close()
Example #5
def output_radio_data_by_day(
        data_file='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json',
        output_file='data/radio_data_by_day.csv'):

    radio_data = read_dict(data_file)
    cities_to_ids = {}

    all_data = {'id': [], 'text': []}
    ref_date = datetime.date(2018, 9, 1)

    all_days = set()
    for i, s in enumerate(radio_data):
        print(i)
        # if i == 100: break
        curr_date = datetime.datetime.utcfromtimestamp(
            s['segment_start_global']).date()

        day_id = (curr_date - ref_date).days
        all_days.add(day_id)
        all_data['id'].append(day_id)
        all_data['text'].append(s['denorm_content'])

    print(all_days)

    df = pd.DataFrame(data=all_data)
    df.to_csv(output_file, index=False)
Example #6
def get_image_files2(image_dir, check=False):
    t = time.time()
    im_names = []  # glob.glob(os.path.join(image_dir, '*.{jpg,png,gif}'))
    for ext in ('*.png', '*.jpg', '*.gif'):
        im_names.extend(glob.glob(os.path.join(image_dir, ext)))
    chinese_dict = read_dict(FLAGS.dict_text)
    words = list(chinese_dict.keys())
    count = 0
    image_tupe = []
    for im_name in im_names:
        try:
            if not os.path.exists(im_name):
                continue
            if check:
                Image.open(im_name)
                # cv2.imread(fp)
            label = im_name.split('_')[1]
            if is_valid_char(label, words):
                os.remove(im_name)
                continue
            if len(label) == 0:
                os.remove(im_name)
                continue
            image_tupe.append((im_name, label))
            count += 1
        except Exception as e:
            print("fn: %s, error: %s" % (im_name, e))
            os.remove(im_name)
    te = time.time() - t
    print("cost time:%f, count:%d" % (te, len(image_tupe)))
    return image_tupe
Example #7
def train_wrapper(seed_fn, source_fn, target_fn, reverse=False, mx_path=None,
                  train_size=5000):
    logging.info("Training...")
    seed_trans = read_dict(seed_fn, reverse=reverse)

    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words = set(seed_trans.keys())
    target_words = set().union(*seed_trans.values())

    source_sp = Space.build(source_fn, lexicon=source_words)
    source_sp.normalize()

    target_sp = Space.build(target_fn, lexicon=target_words)
    target_sp.normalize()

    logging.info("Learning the translation matrix")
    tm, used_for_train = train_tm(source_sp, target_sp, seed_trans, train_size)

    mx_path = default_output_fn(mx_path, seed_fn, source_fn, target_fn,)
    logging.info("Saving the translation matrix to {}".format(mx_path))
    np.save('{}.npy'.format(mx_path), tm)
    pickle.dump(used_for_train, open('{}.train_wds'.format(mx_path),
                                     mode='wb'))

    return tm, used_for_train
Example #8
def train_translation_matrix(source_file, target_file, dict_file, out_file):
    """Trains a translation matrix between the source and target languages, using the words in dict_file as anchor
    points and writing the translation matrix to out_file

    Note that the source language file and target language file must be in the word2vec C ASCII format

    :param source_file: The name of the source language file
    :param target_file: The name of the target language file
    :param dict_file: The name of the file with the bilingual dictionary
    :param out_file: The name of the file to write the translation matrix to
    """
    log.info("Reading the training data")
    train_data = read_dict(dict_file)

    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    log.info("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    log.info("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    log.debug('Words in the source space: %s' % source_sp.row2id)
    log.debug('Words in the target space: %s' % target_sp.row2id)

    log.info("Learning the translation matrix")
    log.info("Training data: %s" % str(train_data))
    tm = train_tm(source_sp, target_sp, train_data)

    log.info("Printing the translation matrix")
    np.savetxt(out_file, tm)
Example #9
    def _read_token(self):
        token_file = os.path.join(utils.get_user_home(), TOKEN_FILENAME)
        if os.path.exists(token_file):
            res = utils.read_dict(token_file)
            if res:
                self.id = res.get('douban_user_id')
                self.tk = res.get('access_token')
                return self.is_authorized()
Example #10
def output_radio_data_by_day_multi_month_for_continuous_context(
        data_file_1='/Users/ngillani/data/radio/2018_08_single_callsign_show_data.json',
        data_file_2='/Users/ngillani/data/radio/2018_09_single_callsign_show_data.json',
        knot_size=3,
        start_date=datetime.date(2018, 8, 15),
        end_date=datetime.date(2018, 9, 15),
        output_file='data/radio_data_by_day_mid_aug_mid_sept_continuous.csv'):

    radio_data = [read_dict(data_file_1), read_dict(data_file_2)]
    # radio_data = [read_dict(data_file_1)]
    cities_to_ids = {}

    all_data = {'id': [], 'attr': [], 'standardized_attr': [], 'text': []}
    ref_date = datetime.date(2018, 8, 15)

    all_days = set()
    for j in range(0, len(radio_data)):
        for i, s in enumerate(radio_data[j]):
            print(j, i)
            # if i == 10000: break
            curr_date = datetime.datetime.utcfromtimestamp(
                s['segment_start_global']).date()

            if curr_date < start_date or curr_date > end_date: continue

            day_id = (curr_date - ref_date).days
            all_days.add(day_id)
            all_data['attr'].append(day_id)
            all_data['text'].append(s['denorm_content'])

    all_days = list(all_days)
    all_days_mean = np.mean(all_days)
    all_days_std = np.std(all_days)
    knots = list(
        range(np.min(all_days),
              np.max(all_days) + knot_size, knot_size))
    for d in all_data['attr']:
        all_data['standardized_attr'].append(
            float(d - all_days_mean) / all_days_std)
        all_data['id'].append(get_knot_id(knots, d))

    df = pd.DataFrame(data=all_data)
    df.to_csv(output_file, index=False)
    print(knots)
Example #11
def api_lookup():
    n = int(request.args.get('n', 5))
    assert n > 0, 'No pronunciations requested'

    voice = request.args.get('voice', None)

    profile = request_to_profile(request, profiles_dirs)
    ps_config = profile.speech_to_text['pocketsphinx']
    espeak_config = profile.text_to_speech['espeak']

    word = request.data.decode('utf-8').strip().lower()
    assert len(word) > 0, 'No word to look up'
    logging.debug('Getting pronunciations for %s' % word)

    # Load base and custom dictionaries
    base_dictionary_path = profile.read_path(ps_config['base_dictionary'])
    custom_path = profile.read_path(ps_config['custom_words'])

    word_dict = {}
    for word_dict_path in [base_dictionary_path, custom_path]:
        if os.path.exists(word_dict_path):
            with open(word_dict_path, 'r') as dictionary_file:
                utils.read_dict(dictionary_file, word_dict)

    result = utils.lookup_word(word, word_dict, profile, n=n)

    # Get phonemes from eSpeak
    espeak_command = ['espeak', '-q', '-x']

    if voice is None:
        if 'voice' in espeak_config:
            # Use profile voice
            voice = espeak_config['voice']
        elif 'language' in profile.json:
            # Use language default voice
            voice = profile.json['language']

    espeak_command.extend(['-v', voice, word])
    logging.debug(espeak_command)
    result['espeak_phonemes'] = subprocess.check_output(
        espeak_command).decode()

    return jsonify(result)
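Examples #11 and #20 pass utils.read_dict an open dictionary file plus a dict to fill, and later treat each value as a list of pronunciation strings, which matches the CMU/pocketsphinx dictionary format ("WORD  P1 P2 ...", alternates written as "WORD(2)"). A sketch under those assumptions (the real Rhasspy helper may differ):

def read_dict(dict_file, word_dict):
    # Assumed behaviour: parse a CMU/pocketsphinx pronunciation dictionary from an
    # open file object and append each pronunciation string to word_dict[word].
    for line in dict_file:
        line = line.strip()
        if not line or line.startswith(';;;'):
            continue
        parts = line.split(None, 1)
        if len(parts) != 2:
            continue
        word, pronunciation = parts
        if '(' in word:  # strip the "(2)" marker used for alternate pronunciations
            word = word[:word.index('(')]
        word_dict.setdefault(word, []).append(pronunciation)
    return word_dict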
Example #12
def main(sys_argv):

    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./tm"
    for opt, val in opts:
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) == 3:
        source_file = argv[1]
        target_file = argv[2]
        dict_file = argv[0]
    else:
        print("Wrong number of arguments")
        usage(1)

    print("Reading the training data")
    train_data = read_dict(dict_file)

    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    print("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print("Learning the translation matrix")
    tm = train_tm(source_sp, target_sp, train_data)

    print("Printing the translation matrix")
    np.savetxt("%s.txt" % out_file, tm)
Example #13
    def __init__(self, root, dictionary_file):
        """
        Contains the game logic variable initialisation.
        root -- the root window,
        dictionary_file -- the path to the file which contains the word dictionary.
        """

        self.root = root
        self.gameBoard = ['-' * GRID_SIZE] * GRID_SIZE
        self.gameBoard[GRID_SIZE // 2] = '-' * (GRID_SIZE // 2) + '0' + '-' * (GRID_SIZE // 2)
        self.lg = LetterGenerator()
        self.sm = ScoreManager()
        self.dictionary = utils.read_dict(dictionary_file)

        self.active_player = 0

        self.player1 = {'name': "Player1", 'letters': self.lg.draw(MAX_HAND_SIZE), 'score': 0}
        self.player2 = {'name': "Player2", 'letters': self.lg.draw(MAX_HAND_SIZE), 'score': 0}
Example #14
            plt.show()
        # close
        coord.request_stop()
        coord.join(threads)


def write_dict():
    cs = open("resource/gb2312_list.txt", 'r').read()
    index = 134
    with open("resource/new_dic2.txt", 'a') as f:
        for c in cs:
            f.write("%d\t%c\n" % (index, c))
            index = index + 1


#python gen_record.py --dataset_name=train --dataset_dir=out --dataset_nums=1024 --output_dir=datasets/train
if __name__ == '__main__':
    chinese_dict = read_dict(FLAGS.dict_text)
    # make_tfrecord2(chinese_dict, FLAGS.dataset_name, FLAGS.dataset_nums)

    # write_dict()
    # words = open("resource/gb2312_list.txt", 'r').read()
    # print(words)

    parse_tfrecord_file()
    #
    # import datasets

    # print(getattr(datasets, "my_data"))

    pass
Example #15
    if len(argv) == 4:
        tm_file = argv[0]
        test_file = argv[1]
        source_file = argv[2]
        target_file = argv[3]

    else:
        print("Wrong number of arguments")
        usage(1)

    print("Loading the translation matrix")
    tm = np.loadtxt(tm_file)

    print("Reading the test data")
    test_data = read_dict(test_file)

    # in the _source_ space, we only need to load vectors for the words in test.
    # semantic spaces may contain additional words, ALL words in the _target_
    # space are used as the search space
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    print("Reading: %s" % source_file)
    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        # read all the words in the space
        lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
                                 comments=None, usecols=(0,)).flatten())
        # the max number of additional+test elements is bounded by the size
Example #16
            out_file = val
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) == 3:
        source_file = argv[1]
        target_file = argv[2]
        dict_file = argv[0]
    else:
        print("Wrong number of arguments")
        usage(1)

    print("Reading the training data")
    train_data = read_dict(dict_file)
    print(train_data)
    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    print("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print("Learning the translation matrix")
    tm = train_tm(source_sp, target_sp, train_data)
Example #17
    args['train_data'] = 'data/cw'  # path to the training data
    args['test_data'] = 'data/cw'  # path to the test data
    args['batch_size'] = 64  # number of samples per training batch
    args['epoch'] = 10  # number of training epochs
    args['hidden_dim'] = 100  # dimensionality of the LSTM cell output
    args['optimizer'] = 'Adam'  # optimizer used to minimize the loss
    args['lr'] = 0.001  # learning rate
    args['clip'] = 5.0  # gradient clipping threshold
    args['dropout'] = 0.5  # keep probability
    args['update_embedding'] = True  # whether to keep updating the embedding after initialization
    args['embedding_dim'] = 100  # embedding dimensionality
    args['shuffle'] = True  # whether to shuffle the data each time it is fed to the LSTM for training

    # Load the vocabulary that maps each character to an id; it was built from the training data
    word2id = read_dict(os.path.join('.', args['train_data'], 'word2id.pkl'))

    # Randomly initialize the embeddings
    embeddings = init_embedding(word2id, args['embedding_dim'])

    # Set the model output path
    model_path = 'BLCM3'
    output_path = os.path.join('.', model_path)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    #if not os.path.exists(model_path):
    #os.makedirs(model_path)
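Example #17 loads word2id from a word2id.pkl file, so its read_dict is presumably just a pickle loader; a minimal sketch under that assumption (the project's actual helper may differ):

import pickle

def read_dict(path):
    # Assumption: word2id.pkl is a pickled dict mapping each character to an integer id.
    with open(path, 'rb') as f:
        return pickle.load(f)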
Example #18
TEST_FILENAME_TO_SEGMENT_DATA = os.path.join(DIR_PATH, 'data/filename_to_segment_ids.csv')

parser = argparse.ArgumentParser()
parser.add_argument("file_path", type=Path)
p = parser.parse_args()

# Load model
if (p.file_path.exists()):
    model = load_model(p.file_path.as_posix())
    model.summary()
else:
    exit("The given file path does not exist: {}".format(p.file_path))

# Data generator for test
input_dim = 400
partition = read_dict(PARTITION_PATH)
test_generator = BreakfastActionTestDataGenerator(partition['testing'],
                                                  batch_size=1,
                                                  input_dim=input_dim)

# Predict using model (returns probabilities)
print("Getting predictions...")
predictions = model.predict_generator(test_generator,
                                      use_multiprocessing=True,
                                      workers=4,
                                      verbose=2)

model_name = p.file_path.as_posix().split("runs/", 1)[1] # model name will have the .hdf5 extension
timestr = time.strftime("_%Y%m%d_%H%M%S")

# Save raw predictions
Example #19
checkpoint_filename = "./runs/frame-simple-dnn-{epoch:02d}-{val_categorical_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(checkpoint_filename,
                             save_best_only=True,
                             mode='min',
                             monitor='val_loss',
                             verbose=1)
callbacks_list = [checkpoint]

# Compile model
model.compile(adagrad,
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
model.summary()

# Load indices for training, testing, and validation
partition = read_dict(PARTITION_PATH)

# Load labels
labels = read_dict(VIDEO_LABELS_PATH)

# Data generators for train/validation
training_generator = BreakfastActionTrainDataGenerator(partition['training'],
                                                       labels=labels,
                                                       batch_size=batch_size,
                                                       input_dim=input_dim,
                                                       output_dim=output_dim,
                                                       shuffle=True)
validation_generator = BreakfastActionTrainDataGenerator(
    partition['validation'],
    labels=labels,
    batch_size=batch_size,
Example #20
def train(profile):
    stt_config = profile.speech_to_text
    intent_config = profile.intent

    # Load sentence templates, write examples
    sentences_ini_path = profile.read_path(stt_config['sentences_ini'])

    # Load from ini file and write to examples file
    words_needed = set()
    sentences_by_intent = defaultdict(list)
    grammars_dir = profile.write_dir(stt_config['grammars_dir'])

    with open(sentences_ini_path, 'r') as sentences_ini_file:
        grammar_paths = jsgf_utils.make_grammars(sentences_ini_file, grammars_dir)

        # intent -> sentence templates
        tagged_sentences = jsgf_utils.generate_sentences(grammar_paths)

        for intent_name, intent_sents in tagged_sentences.items():
            for intent_sent in intent_sents:
                # Template -> untagged sentence + entities
                sentence, entities = utils.extract_entities(intent_sent)

                # Split sentence into words (tokens)
                sentence, tokens = sanitize_sentence(sentence, profile.training)
                sentences_by_intent[intent_name].append((sentence, entities))

                # Collect all used words
                words_needed.update(tokens)

    # Load base and custom dictionaries
    ps_config = stt_config['pocketsphinx']
    base_dictionary_path = profile.read_path(ps_config['base_dictionary'])
    custom_path = profile.read_path(ps_config['custom_words'])

    word_dict = {}
    for word_dict_path in [base_dictionary_path, custom_path]:
        if os.path.exists(word_dict_path):
            with open(word_dict_path, 'r') as dictionary_file:
                utils.read_dict(dictionary_file, word_dict)

    # Add words from wake word if using pocketsphinx
    if profile.wake.get('system') == 'pocketsphinx':
        wake_config = profile.wake['pocketsphinx']
        wake_keyphrase = wake_config['keyphrase']
        _, wake_tokens = sanitize_sentence(wake_keyphrase, profile.training)
        words_needed.update(wake_tokens)

    # Check for unknown words
    unknown_words = words_needed - word_dict.keys()
    unknown_path = profile.read_path(ps_config['unknown_words'])

    if len(unknown_words) > 0:
        with open(unknown_path, 'w') as unknown_file:
            for word in unknown_words:
                result = utils.lookup_word(word, word_dict, profile, n=1)

                pronounces = result['pronunciations']
                phonemes = ' '.join(pronounces)

                # Dictionary uses upper-case letters
                if stt_config.get('dictionary_upper', False):
                    word = word.upper()
                else:
                    word = word.lower()

                print(word, phonemes, file=unknown_file)

        raise RuntimeError('Training failed due to %s unknown word(s)' % len(unknown_words))

    elif os.path.exists(unknown_path):
        # Remove unknown dictionary
        os.unlink(unknown_path)


    # Write out dictionary with only the necessary words (speeds up loading)
    dictionary_path = profile.write_path(ps_config['dictionary'])
    with open(dictionary_path, 'w') as dictionary_file:
        for word in sorted(words_needed):
            for i, pronounce in enumerate(word_dict[word]):
                if i < 1:
                    print(word, pronounce, file=dictionary_file)
                else:
                    print('%s(%s)' % (word, i+1), pronounce, file=dictionary_file)

    logging.debug('Wrote %s word(s) to %s' % (len(words_needed), dictionary_path))

    # Repeat sentences so that all intents will contain the same number
    balance_sentences = profile.training.get('balance_sentences', True)
    if balance_sentences:
        # Use least common multiple
        lcm_sentences = utils.lcm(*(len(sents) for sents
                                    in sentences_by_intent.values()))
    else:
        lcm_sentences = 0  # no repeats

    # Write sentences to text file
    sentences_text_path = profile.write_path(stt_config['sentences_text'])
    with open(sentences_text_path, 'w') as sentences_text_file:
        num_sentences = 0
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for sentence, slots in intent_sents:
                for i in range(num_repeats):
                    print(sentence, file=sentences_text_file)
                    num_sentences = num_sentences + 1

    logging.debug('Wrote %s sentence(s) to %s' % (num_sentences, sentences_text_path))

    # Generate ARPA language model
    lm = train_speech_recognizer(profile)

    # Save to profile
    lm_path = profile.write_path(ps_config['language_model'])
    with open(lm_path, 'w') as lm_file:
        lm_file.write(lm)

    # Expand sentences for intent recognizer
    intent_system = profile.intent.get('system', 'fuzzywuzzy')

    if intent_system == 'rasa':
        rasa_config = profile.intent[intent_system]

        # Use rasaNLU
        examples_md_path = profile.write_path(rasa_config['examples_markdown'])
        with open(examples_md_path, 'w') as examples_md_file:
            for intent_name, intent_sents in tagged_sentences.items():
                # Rasa Markdown training format
                print('## intent:%s' % intent_name, file=examples_md_file)
                for intent_sent in intent_sents:
                    print('-', intent_sent, file=examples_md_file)

                print('', file=examples_md_file)

        # Train rasaNLU
        project_dir = profile.write_dir(rasa_config['project_dir'])
        project_name = rasa_config['project_name']
        rasa_config_path = profile.read_path(rasa_config['config'])

        train_intent_recognizer(examples_md_path, rasa_config_path,
                                project_dir, project_name)
    else:
        fuzzy_config = profile.intent[intent_system]

        # Use fuzzywuzzy
        examples_path = profile.write_path(fuzzy_config['examples_json'])
        examples = intent.make_examples(profile, sentences_by_intent)
        with open(examples_path, 'w') as examples_file:
            json.dump(examples, examples_file, indent=4)
Example #21
        index = [i for i in range(len(iterm)) if iterm[i] == 1]
        if len(index) == 0:
            result.append(['unk'])
        elif len(index) == 1:
            result.append([label[index[0]]])
        else:
            temp = [label[i] for i in index]
            result.append(['|'.join(temp)])
    return result


if __name__ == "__main__":
    args = get_args()
    df_test = pd.read_csv(args["test_file"])
    test_data = df_test["content"].tolist()
    label = read_dict(args["labeldict"])["label"]
    pred = main(test_data, args, len(label))
    print("pred: ", pred[:2])
    pred_encoder = label_encoder(pred, label)
    print("label encoder: ", pred_encoder[:3])
    pred_df = pd.DataFrame(pred_encoder, columns=["pred"])
    df_test = pd.concat([df_test, pred_df],
                        axis=1)  #df_test.append(pred_df, ignore_index=True)
    df_test = df_test[["label", "pred", "content"]]
    df_test.to_csv("./output/test_predict.csv",
                   index=False,
                   header=True,
                   encoding="utf-8-sig")

    # test_data = ["昨天18:30,陕西宁强县胡家坝镇向家沟村三组发生山体坍塌,5人被埋。当晚,3人被救出,其中1人在医院抢救无效死亡,"
    #              "2人在送医途中死亡。今天凌晨,另外2人被发现,已无生命迹象。"]
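The decoding loop at the top of Example #21 turns each multi-hot prediction row back into '|'-joined label strings; the label_encoder used here and in Example #23 is presumably the inverse mapping. A hedged sketch, with the format assumed from the surrounding code:

def label_encoder(label_strings, label_list):
    # Assumed inverse of the decoding above: each label is a string such as "A|B",
    # encoded as a multi-hot vector over label_list.
    encoded = []
    for label in label_strings:
        row = [0] * len(label_list)
        for item in label.split("|"):
            if item in label_list:
                row[label_list.index(item)] = 1
        encoded.append(row)
    return encoded

# e.g. label_encoder(["A|B", "C"], ["A", "B", "C"]) -> [[1, 1, 0], [0, 0, 1]]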
Example #22
        Count the number of single-character words
        Args:
            word_list: list, the list of words
        Return:
            count: int, the number of single-character words
        """
        count = 0
        for word in word_list:
            if len(word) == 1:
                count += 1
        return count


if __name__ == '__main__':

    words_dict = read_dict('./data/assign/dic_ce.txt')
    max_len = len(max(words_dict, key=len))
    test = "我正在上自然语言处理课。"
    segment = MaxSegmentation(words_dict, test, max_len)
    fstart = time.time()
    f_result = segment.ForwardMM()
    fend = time.time()
    print("ForwardMM: {}, running time: {} s".format(f_result,
                                                     str(fend - fstart)))
    rstart = time.time()
    r_result = segment.ReverseMM()
    rend = time.time()
    print("ReverseMM: {}, running time: {} s".format(r_result,
                                                     str(rend - rstart)))
    bistart = time.time()
    bi_result = segment.BMM(f_result, r_result)
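Example #22 exercises forward and reverse maximum matching over a word dictionary; for reference, this is a sketch of the classic forward maximum-matching algorithm that ForwardMM presumably implements (the MaxSegmentation class itself is not shown here):

def forward_maximum_matching(sentence, words_dict, max_len):
    # Greedy left-to-right segmentation: at each position take the longest
    # dictionary word that matches, falling back to a single character.
    result = []
    i = 0
    while i < len(sentence):
        matched = sentence[i]
        for length in range(min(max_len, len(sentence) - i), 1, -1):
            candidate = sentence[i:i + length]
            if candidate in words_dict:
                matched = candidate
                break
        result.append(matched)
        i += len(matched)
    return result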
Example #23
def main():
    args = get_args()
    train_df = pd.read_csv(args["train_file"])
    train_df = shuffle(train_df)
    train_datas = train_df["content"].tolist()

    train_label_total = train_df["label"].unique().tolist()
    print("total data size: {}".format(len(train_datas)))
    # get label dict
    label_list = read_dict(args["labeldict"])["label"]
    if not os.path.exists(args["labeldict"]):
        for label in train_label_total:
            if "|" in label:
                temp = label.split("|")
                for item in temp:
                    if item not in label_list:
                        label_list.append(item)
            else:
                if label not in label_list:
                    label_list.append(label)
        print("label cate size: {}".format(len(label_list)))
        label_dict = {"label": label_list}
        with open(args["labeldict"], "w", encoding="utf-8") as f:
            f.write(json.dumps(label_dict, ensure_ascii=False, indent=4))

    # label encoder
    train_labels = label_encoder(train_df["label"].tolist(), label_list)

    train_data, val_data, train_label, val_label = train_test_split(
        train_datas, train_labels, test_size=0.2, random_state=0)
    print("train data size: {}".format(len(train_data)))
    print("val data size: {}".format(len(val_data)))

    tokenizer = get_tokenizer(args["bert_model_name"],
                              args["pretrain_model_path"])

    train_x, train_y = get_model_data(train_data, train_label, tokenizer,
                                      args["max_length"])

    val_x, val_y = get_model_data(val_data, val_label, tokenizer,
                                  args["max_length"])
    model = create_model(args["bert_model_name"], len(label_list))

    if not os.path.exists(args["model_path"]):
        os.makedirs(args["model_path"])

    if not os.path.exists(args["pbmodel_path"]):
        os.makedirs(args["pbmodel_path"])

    # Save the best model; it is exported in pb (SavedModel) format
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            # Path where to save the model
            # The two parameters below mean that we will overwrite
            # the current checkpoint if and only if
            # the `val_loss` score has improved.
            # The saved model name will include the current epoch.
            filepath=args["model_path"],  # {epoch}
            save_best_only=True,  # Only save a model if `val_loss` has improved.
            monitor='val_auc',  # 'accuracy',
            verbose=1,
            mode='max')
    ]

    model.fit(train_x,
              train_y,
              epochs=args["epoch"],
              verbose=1,
              batch_size=args["batch_size"],
              callbacks=callbacks,
              validation_data=(val_x, val_y),
              validation_batch_size=args["batch_size"])

    model_path = os.path.join("./output/model/", "mulclassifition.h5")
    model.save_weights(model_path)

    tf.keras.models.save_model(model,
                               args["pbmodel_path"],
                               save_format="tf",
                               overwrite=True)
Example #24
    if len(argv) == 4:
        tm_file = argv[0]
        test_file = argv[1]
        source_file = argv[2]
        target_file = argv[3]

    else:
        print("Wrong number of arguments")
        usage(1)

    print("Loading the translation matrix")
    tm = np.loadtxt(tm_file)

    print("Reading the test data")
    test_data = read_dict(test_file)

    #in the _source_ space, we only need to load vectors for the words in test.
    #semantic spaces may contain additional words, ALL words in the _target_
    #space are used as the search space
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    print("Reading: %s" % source_file)
    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        #read all the words in the space
        lexicon = set(
            np.loadtxt(source_file,
                       skiprows=1,
Example #25
#! /usr/bin/env python3

import csv

from utils import read_dict, write_md

filename = "dict.csv"

rows = read_dict(filename)
write_md("dict.md", rows)
Example #26
PARTITION_PATH = os.path.join(DIR_PATH, 'data/segment_partition.csv')

parser = argparse.ArgumentParser()
parser.add_argument("file_path", type=Path)
p = parser.parse_args()

# Load model
if (p.file_path.exists()):
    model = load_model(p.file_path.as_posix())
    model.summary()
else:
    exit("The given file path does not exist: {}".format(p.file_path))

# Data generator for test
input_dim = 400
partition = read_dict(PARTITION_PATH)
test_generator = BreakfastActionTestDataGenerator(partition['testing'],
                                                  batch_size=1,
                                                  input_dim=input_dim)

# Predict using model (returns probabilities)
print("Getting predictions...")
predictions = model.predict_generator(test_generator,
                                      use_multiprocessing=True,
                                      workers=4,
                                      verbose=2)

# Save raw predictions
model_name = p.file_path.as_posix().split("runs/", 1)[1] # model name will have the .hdf5 extension
timestr = time.strftime("%Y%m%d_%H%M%S")
print("Writing predictions...")
Example #27
def main(sys_argv):

    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
                                   ["help", "output=", "correction="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./translated_vecs"
    additional = None
    for opt, val in opts:
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-c", "--correction"):
            try:
                additional = int(val)
            except ValueError:
                usage(1)
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) == 4:
        tm_file = argv[0] 
        test_file = argv[1]
        source_file = argv[2]	
        target_file = argv[3]
    else:
#        print(str(err))
        usage(1)

    print("Loading the translation matrix")
    tm = np.loadtxt(tm_file)

    print("Reading the test data")
    test_data = read_dict(test_file)

    #in the _source_ space, we only need to load vectors for the words in test.
    #semantic spaces may contain additional words, ALL words in the _target_ 
    #space are used as the search space
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    print("Reading: %s" % source_file)
    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        #read all the words in the space
        lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 
                                    comments=None, usecols=(0,)).flatten())
        #the max number of additional+test elements is bounded by the size 
        #of the lexicon
        additional = min(additional, len(lexicon) - len(source_words))
        #we sample additional elements that are not already in source_words
        random.seed(100)
        lexicon = random.sample(list(lexicon.difference(source_words)), additional)
        
        #load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))
    
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    print("Translating") #translates all the elements loaded in the source space
    mapped_source_sp = apply_tm(source_sp, tm)
    
    print("Retrieving translations")
    test_data = get_valid_data(source_sp, target_sp, test_data)

    #turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    score(mapped_source_sp, target_sp, gold, additional)

    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
    np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")
Example #28
checkpoint_filename = "./runs/segment-lstm-{epoch:02d}-{val_categorical_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(checkpoint_filename,
                             monitor='val_loss',
                             save_best_only=True,
                             mode='min',
                             verbose=1)
callbacks_list = [checkpoint]

# Compile model (use default Adam optimizer)
model.compile('adam',
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
model.summary()

# Load indices for training, testing, and validation
partition = read_dict(PARTITION_PATH)

# Load labels
labels = read_dict(SEGMENT_LABELS_PATH)

# Data generators for train/validation
training_generator = BreakfastActionTrainDataGenerator(partition['training'],
                                                       labels=labels,
                                                       batch_size=batch_size,
                                                       input_dim=input_dim,
                                                       output_dim=output_dim,
                                                       shuffle=True)
validation_generator = BreakfastActionTrainDataGenerator(
    partition['validation'],
    labels=labels,
    batch_size=batch_size,
Example #29
def main(sys_argv):
    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:c:l:m:1:2:t:a:v:", [
            "help", "output=", "correction=", "levenshtein=", "matrix=", "1=",
            "2=", "topK=", "alpha=", "verbosity="
        ])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./translated_vecs"
    additional = None
    levcosts = {}
    for opt, val in opts:
        # print(opt+'='+val)
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-l", "--levenshtein"):
            levcosts = u.readcosts(val)
        elif opt in ("-m", "--matrix"):
            tm_file = val
        elif opt == '-1':
            source_file = val
        elif opt == '-2':
            target_file = val
        elif opt in ("-c", "--correction"):
            try:
                additional = int(val)
            except ValueError:
                print("additional: %s" % val)
                usage(1)
        elif opt in ("-t", "--topK"):
            try:
                u.topK = int(val)
            except ValueError:
                print("topK: %s" % val)
                usage(1)
        elif opt in ("-v", "--verbosity"):
            try:
                u.verbosity = int(val)
            except ValueError:
                print("verbosity: %s" % val)
                usage(1)
        elif opt in ("-a", "--alpha"):
            try:
                u.alpha = float(val)
            except ValueError:
                print("alpha: %s" % val)
                usage(1)
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            print("Unknown option: -%s %s" % (opt, val))
            usage(1)

    if len(argv) == 1:
        test_file = argv[0]
    else:
        print('Unused arguments:')
        print(argv)
        usage(1)

    #if u.verbosity>0: # always log the parameters in the output
    sys.stdout.write(sys_argv[0] + " ")
    for opt, val in opts:
        sys.stdout.write(opt + " " + val + " ")
    print(test_file)

    if u.verbosity > 1:
        print("Loading the translation matrix %s " % tm_file)
    tm = np.loadtxt(tm_file)

    if u.verbosity > 1:
        print("Reading the test data %s " % test_file)
    test_data = u.read_dict(test_file)

    #in the _source_ space, we only need to load vectors for the words in test.
    #semantic spaces may contain additional words, ALL words in the _target_
    #space are used as the search space
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    if u.verbosity > 1:
        print("Reading: %s" % source_file)

    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        #read all the words in the space
        with io.open(source_file, 'r', encoding='utf8') as f:
            lexicon = set([l.split(' ')[0] for l in f])
        # lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
        #                             comments=None, usecols=(0,)).flatten())
        #the max number of additional+test elements is bounded by the size
        #of the lexicon
        additional = min(additional, len(lexicon) - len(source_words))
        #we sample additional elements that are not already in source_words
        random.seed(100)
        if additional > 0:
            lexicon = random.sample(list(lexicon.difference(source_words)),
                                    additional)

        #load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))

    source_sp.normalize()

    if u.verbosity > 1:
        print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    if u.verbosity > 1:
        print("Retrieving translations")
    test_data = u.get_valid_data(source_sp, target_sp, test_data)

    #turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    if u.verbosity > 1:
        print("Translating")  # translates all the elements loaded in the source space
    source_sp = u.apply_tm(source_sp, tm)

    u.score(source_sp, target_sp, gold, additional, levcosts)
    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, source_sp.mat)
    #    np.savetxt("%s.wds.txt" % out_file, source_sp.id2row, fmt="%s")  # no utf8
    with open("%s.wds.txt" % out_file, "w") as outf:
        for s in source_sp.id2row:
            print(s, file=outf)
Example #30
        line = line.strip()
        if (len(line) == 0) or line.startswith('#'):
            continue  # skip blanks and comments

        parts = line.split(' ', maxsplit=1)
        phonemes[parts[0]] = parts[1]

# Load dictionaries
dictionary_files = config['training']['dictionary_files']
word_dict = defaultdict(set)
for dict_path in dictionary_files:
    if not os.path.exists(dict_path):
        continue

    logging.debug('Loading dictionary from %s' % dict_path)
    read_dict(dict_path, word_dict)

# ---------------------------------------------------------------------

# Create web server
app = Flask('rhasspy', template_folder=os.path.join('web', 'templates'))
app.secret_key = str(uuid4())

# Automatically reload template files if they're changed on disk.
# Used for debugging/development.
app.config['TEMPLATES_AUTO_RELOAD'] = True

# ---------------------------------------------------------------------
# Static Routes
# ---------------------------------------------------------------------
Example #31
            usage(0)
        else:
            usage(1)

    if len(argv) == 5:
        source_file = argv[1]
        target_file = argv[2]
        test_file = argv[3]
        model = eval(argv[4])
        dict_file = argv[0]
    else:
        print("Wrong number of arguments")
        usage(1)

    print("Reading the training data")
    train_data = read_dict(dict_file)
    print(train_data)
    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    print("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print("Learning the translation matrix")
    tm = train_tm_model(source_sp, target_sp, train_data, model)
Example #32
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) == 3:
        source_file = argv[1]
        target_file = argv[2]
        dict_file = argv[0]
    else:
        print("Wrong number of arguments")
        usage(1)


    print("Reading the training data")
    train_data = read_dict(dict_file)

    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    print("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print("Learning the translation matrix")
    print("Training data: %s" % str(train_data))