Example No. 1
 def forward(self,Y,h,c, outEncoder,teacher_force): # Y is (number of characters) x 256
     if (np.random.rand()>teacher_force):
         seq_len=Y.shape[0]-1
         output_decoder= load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
         Y = self.embedding(Y)
         for i in range(len(Y)-1): # -1 because <sos> is not counted in the criterion
             h[0],c[0] = self.lstm1(Y[i],(h[0].clone(),c[0].clone()))
             h[1],c[1] = self.lstm2(h[0].clone(),(h[1].clone(),c[1].clone()))
             h[2],c[2] = self.lstm3(h[1].clone(),(h[2].clone(),c[2].clone()))
             h2=h[2].clone()
             context = self.attention(h2, outEncoder,BATCH_SIZE)
             context =  torch.bmm( context,outEncoder.view(outEncoder.shape[1],outEncoder.shape[0],-1) )
            # print("context",context.shape) # torch sueeze
             output_decoder[i] = self.MLP(torch.cat( (h2,torch.squeeze(context,1)) ,1 ))    
     else:
         seq_len=Y.shape[0]-1
         output_decoder= load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
         alphabet = Alphabet()
         Y_cur = self.embedding( load_to_cuda(Variable(torch.LongTensor([alphabet.ch2index('<sos>')]))) ).view(1,self.hidden_size)
         for  i in range(seq_len-1):
             Y_cur=Y_cur.expand(BATCH_SIZE,self.hidden_size)
             h[0],c[0] = self.lstm1(Y_cur,(h[0].clone(),c[0].clone()))
             h[1],c[1] = self.lstm2(h[0].clone(),(h[1].clone(),c[1].clone()))
             h[2],c[2] = self.lstm3(h[1].clone(),(h[2].clone(),c[2].clone()))
             h2 = h[2].clone()
             context = self.attention(h2, outEncoder,BATCH_SIZE)
             context = torch.bmm( context,outEncoder.view(outEncoder.shape[1],outEncoder.shape[0],-1) )
             output_decoder[i]  =  self.MLP(torch.cat( (h2,torch.squeeze(context,1)) ,1 ))
             argmax = torch.max(output_decoder[i][0],dim=0)
             Y_cur=self.embedding( Variable(load_to_cuda(torch.LongTensor([argmax[1][0].data[0]]))) ).view(1,self.hidden_size)
     return output_decoder 
Example No. 2
 def __init__(self):
     self.categories = Categories()
     self.categories.load()
     self.alphabet = Alphabet()
     self.alphabet.load()
     self.responses = []
     self.nextRound()
Example No. 3
class Round():
    def __init__(self):
        self.categories = Categories()
        self.categories.load()
        self.alphabet = Alphabet()
        self.alphabet.load()
        self.responses = []
        self.nextRound()


    def allResponses(self):
        return [d['response'] for d in self.responses]

    def getResponse(self, ptn):
        log( 'getResponse for ' + ptn )
        try:
            pr = [d for d in self.responses if d['tn'] == ptn]
            return pr[0]
        except Exception as e:
            return { 'tn': ptn, 'valid': False, 'response': 'UNK' }

    def nextRound(self):
        self.cat_index = randint( 0, len(self.categories.data)-1)
        log( self.cat_index)
        self.alpha_index = randint( 0, len(self.alphabet.data)-1)
        log( self.alpha_index )
        self.responses = []

    def describe(self):
        alpha = self.alphabet.data[self.alpha_index]
        return  self.categories.data[self.cat_index]['category'] + " that " + alpha['position'].lower() + " " + alpha['letter']
Example No. 4
 def __init__(self,
              data,
              encoding="utf-8",
              feature_alphabet=None,
              alphabet_pop=True,
              alphabet_lock=True,
              sep=":",
              bias=False,
              bias_prefix="@@BIAS@@"):
     Source.__init__(self, data, encoding=encoding)
     self._Instance = BinaryClassificationInstance
     if feature_alphabet != None:
         self._feature_alphabet = feature_alphabet
     else:
         self._feature_alphabet = Alphabet(locked=False)
     self._sep = sep
     self._bias = bias
     self._bias_prefix = bias_prefix
     if alphabet_pop:
         self._populate_alphabet()
     if alphabet_lock:
         self.lock_alphabet()
     else:
         self.unlock_alphabet()
     return
Example No. 5
 def __init__(self):
     self.states = State()
     self.sigma = Alphabet()
     self.delta = list()
     self.delta_nfa = list()
     self.initial_state = None
     self.final_state = list()
Example No. 6
 def __init__(self):
     super(VsmNormer, self).__init__()
     self.word_alphabet = Alphabet('word')
     self.embedding_dim = None
     self.word_embedding = None
     self.dict_alphabet = Alphabet('dict')
     self.dict_embedding = None
     self.gpu = opt.gpu
Example No. 7
 def test_cross_off_adds_guessed_letter_to_list_of_guessed_letters(self):
     # arrange
     alphabet = Alphabet()
     letter = "a"
     # act
     alphabet.cross_off(letter)
     # assert
     assert letter in alphabet.guessed_letters
Example No. 8
 def test_already_guessed_returns_true_if_letter_guessed(self):
     # arrange
     alphabet = Alphabet()
     letter = "h"
     alphabet.cross_off(letter)
     # act
     result = alphabet.already_guessed("h")
     # assert
     assert result is True
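The two tests above (Examples No. 7 and 8) pin down a small interface for this hangman-style Alphabet: cross_off(letter), already_guessed(letter), and a guessed_letters collection. A minimal sketch that would satisfy just these two tests; the real class in the tested project may hold more state (e.g. the remaining letters):

class Alphabet:
    def __init__(self):
        # Letters the player has guessed so far.
        self.guessed_letters = []

    def cross_off(self, letter):
        # Record a guessed letter.
        self.guessed_letters.append(letter)

    def already_guessed(self, letter):
        # True once the letter has been crossed off.
        return letter in self.guessed_letters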
Example No. 9
 def __init__(self, args):
     self.config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
     if args.dataset not in self.config['data_list']:
         raise KeyError("No such dataset named {}.".format(args.dataset))
     self.config['dataset'] = args.dataset
     self.datatype = 'binary'
     if self.config['dataset'] in self.config['datatype']['train_test']:
         self.datatype = 'train_test'
     self.alphabet = Alphabet('word')
     self.set_seed()
Example No. 10
def get_word(seq): # seq: a sequence of numeric indices
    #print(seq)
    alphabet=Alphabet()
    s=""
    if len(seq)==0:
        return s
    for el in seq:
        #print("el:",el.data)
        s+=alphabet.index2ch(el)
    return s
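A hedged usage sketch for get_word, assuming Alphabet.index2ch is the inverse of the ch2index lookup seen in Example No. 1; plain integer indices are used here, though in the surrounding project the elements may be tensor values, and the concrete index-to-character mapping is project-specific.

# Hypothetical indices; the characters they decode to depend on the Alphabet ordering.
decoded = get_word([12, 5, 7, 22])   # e.g. "word"
assert get_word([]) == ""            # the empty sequence decodes to the empty string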
Example No. 11
    def train(self, bitextGen):
        self.frenchAlphabet = Alphabet.from_iterable(
            word for frSent, enSent in bitextGen(desc='French Alphabet')
            for word in frSent)
        self.englishAlphabet = Alphabet.from_iterable(
            word for frSent, enSent in bitextGen(desc='English Alphabet')
            for word in enSent)
        self.frenchAlphabet.freeze()
        self.englishAlphabet.freeze()
        vF = len(self.frenchAlphabet)
        vE = len(self.englishAlphabet)
        tOfEGivenF = np.ones((vE, vF)) / vF
        aOfIJGivenLenELenF = AlignmentDict()
        for ep in tqdm(range(self.epochs), desc='Epoch'):
            countOfEGivenF = np.zeros((vE, vF))
            totalOfF = np.zeros(vF)
            countOfIGivenJ = AlignmentDict()
            totalOfJ = CountDict()
            for frSent, enSent in bitextGen('Training'):
                # Compute Normalization stuff
                lenF = len(frSent)
                frMask = self.frenchAlphabet.map(frSent)

                lenE = len(enSent)
                enMask = self.englishAlphabet.map(enSent)

                aOfIJ = aOfIJGivenLenELenF[lenE, lenF]

                # total probability of each english word being translated from the french ones
                # has size of {len(enSent) x 1}
                sTotalOfE = np.sum(tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ,
                                   axis=1,
                                   keepdims=True)

                # calculate counts

                delta = tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ / sTotalOfE
                deltaSummedOverE = np.sum(delta, axis=0)

                countOfEGivenF[np.ix_(enMask, frMask)] += delta
                totalOfF[frMask] += deltaSummedOverE

                countOfIGivenJ[lenE, lenF] += delta
                totalOfJ[lenE, lenF] += deltaSummedOverE

            # estimate probabilities
            tOfEGivenF = countOfEGivenF / totalOfF
            for lenE, lenF in aOfIJGivenLenELenF:
                aOfIJGivenLenELenF[
                    lenE,
                    lenF] = countOfIGivenJ[lenE, lenF] / totalOfJ[lenE, lenF]

        self.tOfEGivenF = tOfEGivenF
        self.aOfIJGivenLenELenF = aOfIJGivenLenELenF
Example No. 12
def map_string_2_id_open(string_list, name):
    string_id_list = []
    alphabet_string = Alphabet(name)
    for strings in string_list:
        ids = []
        for string in strings:
            id = alphabet_string.get_index(string)
            ids.append(id)
        string_id_list.append(ids)
    alphabet_string.close()
    return string_id_list, alphabet_string
Example No. 13
def _alphabet_from_rdata(rdata, void_label, dummy_label):
    """Extract alphabet (of observations and labels) from
    given raw data."""
    alphabet = Alphabet(void_label=void_label, dummy_label=dummy_label)
    for (sent, labels) in rdata:
        for word in sent:
            for x in chain(*word):
                alphabet.add_observation(x)
        for y in labels:
            alphabet.add_label(y)
    return alphabet
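A hedged illustration of the raw-data shape the loops above imply: rdata is a sequence of (sentence, labels) pairs, and each word in a sentence is a collection of feature groups that chain(*word) flattens into individual observations. The feature strings, labels, and the void/dummy label values below are illustrative only.

rdata = [
    (
        [  # sentence: one entry per word, each a list of feature groups
            [["w=The", "lower=the"], ["shape=Xxx"]],
            [["w=cat", "lower=cat"], ["shape=xxx"]],
        ],
        ["DET", "NOUN"],  # one label per word
    ),
]
alphabet = _alphabet_from_rdata(rdata, void_label="O", dummy_label="<DUMMY>")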
Example No. 14
    def time_stamp_calc(self):
        time = floor(self.creation_time)
        # choosing a seed value to compare, in this case, the date I'm writing this code
        seed = 10012019
        alpha = Alphabet()
        alpha.shuffle()
        index = (time % seed)
        # digit_one = alpha[temp_time % seed]
        # alpha = alpha.shuffle()
        # digit_two = alpha[temp_time % seed]

        return index
Example No. 15
def load_config_pos(config_path, char_embedd_dim):
    max_sent_length, max_char_length, num_labels, embedd_dim_concat = load_config(config_path)
    alphabet_char = Alphabet('char', keep_growing=False)
    alphabet_char.load(config_path, 'alphabet_char')
    alphabet_label = Alphabet('label', keep_growing=False)
    alphabet_label.load(config_path, 'alphabet_label')
    scale = np.sqrt(3.0 / char_embedd_dim)
    char_embedd_table = np.random.uniform(-scale, scale, [alphabet_char.size(), char_embedd_dim]).\
        astype(theano.config.floatX)
    return max_sent_length, max_char_length, num_labels, embedd_dim_concat, alphabet_char, alphabet_label, \
           char_embedd_table
Example No. 16
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground-truth configuration, and the active labels. This
    function maps each feature and each label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)
        A.add_many(f for token in x.sequence for f in token.attributes)
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
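A hedged sketch of the input shape build_domain expects here: each item carries a truth label sequence and a token sequence whose tokens expose an attributes list. The data classes below are hypothetical stand-ins, not the project's real ones.

class Token(object):
    def __init__(self, attributes):
        self.attributes = attributes   # active feature strings for this token

class Example(object):
    def __init__(self, truth, sequence):
        self.truth = truth             # gold label sequence
        self.sequence = sequence       # list of Token objects

data = [Example(truth=["B-PER", "O"],
                sequence=[Token(["w=John", "shape=Xxxx"]), Token(["w=runs"])])]
L, A = build_domain(data)              # L: label alphabet, A: feature alphabet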
Example No. 17
def main():
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    outdir = HOME_DIR + '_' + input_fname
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ddir = 'semeval/binary'
    train16 = "task-BD-train-2016.tsv"
    dev2016 = "task-BD-dev-2016.tsv"
    devtest2016 = "task-BD-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-BD.txt"

    fname_vocab = os.path.join(outdir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_idx = alphabet.fid
    print "alphabet", len(alphabet)
    print 'dummy_word:', dummy_word_idx

    topic_alphabet = Alphabet(start_feature_id=0)
    topic_alphabet.add('UNKNOWN_TOPIC_IDX')
    dummy_topic_idx = topic_alphabet.fid

    print "Loading Semeval Data"
    # save SemEval tweets separately
    files = [train16, dev2016, devtest2016, test2016]
    for fname in files:
        fname_ext = os.path.join(ddir, fname)
        tid, topics, tweets, sentiments = load_data(fname_ext, topic_alphabet)
        print "Number of tweets:", len(tweets)

        tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx)
        topic_idx = get_topic_indices(tweets, topics, topic_alphabet)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)),
                tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)),
                sentiments)
        np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)),
                topic_idx)

    cPickle.dump(
        topic_alphabet,
        open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
Example No. 18
    def __init__(self, config, alphabet: Alphabet, emb_dim, device):
        super(TextCNN, self).__init__()
        self.config = config
        self.embeddings = nn.Embedding(alphabet.size(), emb_dim)
        # self.embeddings.weight.requires_grad = False
        if config['train_mode'] == 'static':
            self.embeddings = self.embeddings.from_pretrained(
                torch.from_numpy(alphabet.pretrained_emb))
        elif config['train_mode'] == 'fine-tuned':
            self.embeddings.weight.data.copy_(
                torch.from_numpy(alphabet.pretrained_emb))

        filters = config['filters']
        self.cnn = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(1, config['output_channels'], [w, emb_dim]),
                nn.ReLU(), nn.AdaptiveMaxPool2d(1)) for w in filters
        ])

        self.linear = nn.Linear(config['output_channels'] * len(filters),
                                2,
                                bias=True)
        self.dropout = nn.Dropout(config['dropout'])
        self.relu = nn.ReLU()
        self.scale = np.sqrt(3.0 / emb_dim)
        self.apply(self._init_esim_weights)
Example No. 19
 def initial_feature_alphabets(self):
     items = open(self.train_dir, 'r').readline().strip('\n').split()
     print items
     total_column = len(items)
     if total_column > 2:
         for idx in range(1, total_column - 1):
             feature_prefix = items[idx].split(']', 1)[0] + "]"
             print "feature_prefix:{}".format(feature_prefix)
             self.feature_alphabets.append(Alphabet(feature_prefix))
             self.feature_name.append(feature_prefix)
             print "Find feature: ", feature_prefix
     self.feature_num = len(self.feature_alphabets)
     self.pretrain_feature_embeddings = [None] * self.feature_num
     self.feature_emb_dims = [20] * self.feature_num
     self.feature_emb_dirs = [None] * self.feature_num
     self.norm_feature_embs = [False] * self.feature_num
     self.feature_alphabet_sizes = [0] * self.feature_num
     if self.feat_config:
         for idx in range(self.feature_num):
             if self.feature_name[idx] in self.feat_config:
                 self.feature_emb_dims[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_size']
                 self.feature_emb_dirs[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_dir']
                 self.norm_feature_embs[idx] = self.feat_config[
                     self.feature_name[idx]]['emb_norm']
Example No. 20
    def initial_feature_alphabets(self):
        for l in open(self.train_dir, 'r').readlines():
            if not l.startswith("#") and not l.startswith("-BOS-"):
                items = l.strip("\n").split()
                break

        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = items[idx].split(']', 1)[0] + "]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)

        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [self.HP_feature_default_size
                                 ] * self.feature_num
        #self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_norm']
Example No. 21
    def __init__(self):
        self.name2id = {}  # preferred name -> id
        self.id2name = {}  # id -> CTD_Term
        self.altid2id = {}  # alternative id -> id

        if opt.method == 'cla':
            self.id_alphabet = Alphabet('id')
Example No. 22
 def test_given_alphabet_as_int_returns_error(self):
     test_data = 123456
     # constructing an Alphabet from an int is expected to raise
     with self.assertRaises(Exception):
         Alphabet('Test', test_data)
Example No. 23
def main():
    UPPER_STRING = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    testAlphabet = Alphabet(UPPER_STRING)
    permutation1 = Permutation(
        "(AELTPHQXRU) (BKNW) (CMOY) (DFG) (IV) (JZ) (S)", testAlphabet)
    permutation2 = Permutation(
        "(FIXVYOMW) (CDKLHUP) (ESZ) (BJ) (GR) (NT) (A) (Q)", testAlphabet)
    permutation3 = Permutation("(ABDHPEJT) (CFLVMZOYQIRWUKXSG) (N)",
                               testAlphabet)
    permutation4 = Permutation("(AEPLIYWCOXMRFZBSTGJQNH) (DV) (KU)",
                               testAlphabet)
    permutation5 = Permutation(
        "(AE) (BN) (CK) (DQ) (FU) (GY) (HW) (IJ) (LO) (MP) (RX) (SZ) (TV)",
        testAlphabet)

    rotor1 = Rotor("I", permutation1, "TG")
    rotor2 = Rotor("II", permutation2, "A")
    rotor3 = Rotor("III", permutation3, "B")
    rotor4 = Rotor("IV", permutation4, "XO")
    reflector = Reflector("A", permutation5)

    rotors = [reflector, rotor4, rotor3, rotor2, rotor1]

    machine = Machine(testAlphabet, 5, 6, rotors)
    machine.insertRotors(["A", "IV", "III", "II", "I"])
    machine.setRotors("AAAA")

    message = input("What to convert:")
    print(machine.convertMsg(message))
Example No. 24
    def __init__(self, frame_dir):
        self.frame_dir = frame_dir
        self.alphabet = Alphabet()
        # self.words = [name for name in os.listdir(FRAME_DIR)]

        # for a recursive walk through the video folders
        self.words = []
        for root, dirs, files in os.walk(self.frame_dir):
            if not dirs:
                self.words.append(root)

            # print('root: ', root)
            # print('dirs: ', dirs)
            # print('files: ', files)

    # print(self.words)
        self.count = 0
Example No. 25
    def __init__(self, opt):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)

        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None

        self.pretrain_word_embedding = None
        self.word_emb_dim = opt.word_emb_dim

        self.config = self.read_config(opt.config)
        self.feat_config = None

        the_item = 'ner_feature'
        if the_item in self.config:
            self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
            self.feature_alphabets = []
            self.feature_emb_dims = []
            for k, v in self.feat_config.items():
                self.feature_alphabets.append(Alphabet(k))
                self.feature_emb_dims.append(int(v['emb_size']))
Example No. 26
def decode_all(manifests):
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept strings encoded in UTF-8
    alphabet = Alphabet(args.vocab_path)
    ds2_model.logger.info("start decoding with extended output...")
    ds2_model.init_ext_scorer(args.alpha, args.beta,
                              args.lang_model_path, args.trie_path,
                              alphabet)

    for audioname, manifest_path, duration, offset in manifests:
        try:
            duration_f = float(duration)
            if duration_f < 1.:
                yield (audioname, manifest_path,
                       None, duration, offset)
                continue
        except (TypeError, ValueError):
            pass
        batch_reader = data_generator.batch_reader_creator(
            manifest_path=manifest_path,
            batch_size=args.num_samples,
            min_batch_size=1,
            sortagrad=False,
            shuffle_method=None)

        for decode_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=decode_data,
                feeding_dict=data_generator.feeding)

            # note: we only perform single file decoding
            result_transcript = ds2_model.decode_beam_search(
                probs_split=probs_split,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                alphabet=alphabet)

            yield (audioname, manifest_path,
                   result_transcript, duration, offset)
Example No. 27
    def __data_generation(self, list_IDs_temp, indexes):
        'Generates data containing batch_size samples'
        # X : (n_samples, *dim, n_channels)
        # Initialization
        X = []

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            X.append(self.load_audio(join(self.path_prefix, ID)))

        X = self.pad_and_transpose(X)
        X = self.align(X)

        alphabet = Alphabet("alphabet.txt")
        #lyrics = alphabet.get_batch_labels(self.fetch_lyrics(self.labels))
        lyrics = alphabet.get_batch_labels(self.labels[indexes])
        #lyrics = self.align(lyrics)

        return (X, lyrics)
Example No. 28
 def __init__(self, input_file):
     self.original_data = open(input_file, 'r').readlines()
     self.index_data = []
     self.word_alphabet = Alphabet('word')
     self.gloss_alphabet = Alphabet('gloss')
     self.entity_alphabet = Alphabet('entity')
     self.gaz_alphabet = Alphabet('gaz')
     self.label_alphabet = Alphabet('label')
     self.word_alphabet_size = 0
     self.gloss_alphabet_size = 0
     self.entity_alphabet_size = 0
     self.gaz_alphabet_size = 0
     self.label_alphabet_size = 0
     ### hyperparameters
     self.HP_iteration = 100
     self.HP_batch_size = 1
     self.HP_gaz_hidden_dim = 50
     self.HP_lstm_hidden_dim = 200
     self.HP_dropout = 0.5
     self.gaz_dropout = 0.5
     self.HP_lstm_layer = 1
     self.HP_bilstm = False
     self.HP_use_entity = False
     self.HP_use_gloss = True
     self.HP_use_gaz = False
     self.HP_gpu = True
     self.HP_lr = 0.015
     self.HP_lr_decay = 0.05
     self.HP_clip = 5.0
     self.HP_momentum = 0
     self.HP_iteration = 100
     # embedding hyperparameter
     self.word_emb_dim = 200
     self.entity_emb_dim = 50
     self.gloss_features = "CNN"  #["CNN","LSTM"]
     self.gloss_emb_dim = 200
     self.gloss_hidden_dim = 300
     self.pretrain_word_embedding = np.array([])
     self.pretrain_gaz_embedding = None
     self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  #"NYM_200.txt"
     self.gaz_embed_path = None
     self.gaz_emb_dim = 200
     self.HP_fix_gaz_emb = True
Example No. 29
 def test_str_representation_does_not_show_hidden_letters(self):
     # arrange
     alphabet = Alphabet()
     word = Word(alphabet)
     word.word_to_guess = "aardvark"
     word.guess_letter("a")
     # act
     hidden_word = str(word)
     # assert
     assert hidden_word == "aa___a__"
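A minimal sketch of a Word class that would pass this test, assuming only the behaviour the test exercises (unguessed letters render as underscores, guessed ones show through); the project's real class is not shown here.

class Word:
    def __init__(self, alphabet):
        self.alphabet = alphabet
        self.word_to_guess = ""
        self.guessed = set()

    def guess_letter(self, letter):
        self.guessed.add(letter)

    def __str__(self):
        # Reveal guessed letters, hide the rest behind underscores.
        return "".join(ch if ch in self.guessed else "_" for ch in self.word_to_guess)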
Example No. 30
def make_alphabet():
    alphabet = Alphabet(0)
    load_dataset("%s/%s.train.txt" % (data, dataset), alphabet)
    load_dataset("%s/%s.valid.txt" % (data, dataset), alphabet)
    if dataset == 'ptb':
        load_dataset("%s/%s.test.txt" % (data, dataset), alphabet)
    # add all the words from all three datasets

    print("%s: total %d words" % (dataset, len(alphabet)))
    pickle.dump(alphabet, open("%s/alphabet.pkl" % data, "wb"))
Example No. 31
def main():
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    outdir = HOME_DIR + '_' + input_fname
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ddir = 'semeval/binary'
    train16 = "task-BD-train-2016.tsv"
    dev2016 = "task-BD-dev-2016.tsv"
    devtest2016 = "task-BD-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-BD.txt"

    fname_vocab = os.path.join(outdir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_idx = alphabet.fid
    print "alphabet", len(alphabet)
    print 'dummy_word:',dummy_word_idx

    topic_alphabet = Alphabet(start_feature_id=0)
    topic_alphabet.add('UNKNOWN_TOPIC_IDX')
    dummy_topic_idx = topic_alphabet.fid

    print "Loading Semeval Data"
    # save SemEval tweets separately
    files = [train16,dev2016,devtest2016,test2016]
    for fname in files:
        fname_ext = os.path.join(ddir,fname)
        tid,topics,tweets, sentiments = load_data(fname_ext,topic_alphabet)
        print "Number of tweets:",len(tweets)

        tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx)
        topic_idx = get_topic_indices(tweets,topics,topic_alphabet)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments)
        np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)), topic_idx)

    cPickle.dump(topic_alphabet, open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
Example No. 32
    def __init__(self, item_path, sub_item_path, pair_path, split_c=','):
        self.__dict__.update(locals())

        print('Loading title and category information...', time.ctime())
        sub_item_set = set()
        for line in open(sub_item_path).readlines():
            sub_item_set.add(line.split()[0])
        self.item_title = {}
        self.item_cat = {}
        self.cat2idx = {}
        self.max_len = 0
        sentence_list = []
        for line in open(item_path).readlines():
            tmp = line.split()
            item = tmp[0]
            cat = tmp[1]
            if cat not in self.cat2idx:
                self.cat2idx[cat] = len(self.cat2idx)
            title = tmp[2].split(split_c)
            self.item_title[item] = title
            self.item_cat[item] = self.cat2idx[cat]
            if item in sub_item_set:
                sentence_list.append(title)
                self.max_len = min(config.max_len, max(self.max_len,
                                                       len(title)))
        print(('%s items' % len(sentence_list)), time.ctime())

        print('Generating alphabet...', time.ctime())
        self.alphabet = Alphabet()
        add_to_vocab(sentence_list, self.alphabet)
        print(('%s words' % len(self.alphabet)), time.ctime())

        print('Generating weight from word2vec model...', time.ctime())
        self.sentence_list = sentence_list
        w2v_model = word2vec(sentence_list)
        self.w2v_weight = np.zeros((len(self.alphabet), config.w2vSize))
        for word, idx in self.alphabet.iteritems():
            if word in w2v_model.vocab:
                self.w2v_weight[idx] = w2v_model[word]

        print('Loading pairs ...', time.ctime())
        self.pair_list = open(pair_path).readlines()
Example No. 33
    def initial_feature_alphabets(self):

        feature_prefix = '[Cap]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 0

        feature_prefix = '[POS]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 1

        self.feature_num = len(self.feature_alphabets)
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']
Example No. 34
def pad(s, destination_size=None):
    """Pad a string using different whitespace characters to stop Twitter
    from thinking two tweets are the same.

    Will try to add 10% whitespace to the string.
    """
    if not destination_size:
        destination_size = min(len(s) + max(int(len(s)*0.1), 5), 140)
    padding = ''
    for i in range(len(s), destination_size):
        padding += Alphabet.random_whitespace()
    return s + padding
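A hedged usage sketch, assuming Alphabet.random_whitespace() returns one whitespace-like character per call, as the name suggests.

tweet = "Same status text as last time"
padded = pad(tweet)          # roughly 10% extra whitespace, capped at 140 characters
assert padded.startswith(tweet) and len(padded) <= 140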
Example No. 35
	def __init__( self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True, alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@" ):
		Source.__init__(self, data, encoding=encoding)
		self._Instance = BinaryClassificationInstance
		if feature_alphabet != None:
			self._feature_alphabet = feature_alphabet
		else:
			self._feature_alphabet = Alphabet(locked=False)
		self._sep = sep
		self._bias = bias
		self._bias_prefix = bias_prefix
		if alphabet_pop:
			self._populate_alphabet()
		if alphabet_lock:
			self.lock_alphabet()
		else:
			self.unlock_alphabet()
		return
Example No. 36
def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    dummy_word_idx = DUMMY_WORD_IDX

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    inp = 'train'
    store_file(data_dir.format(inp),output_dir_tweets.format(inp),alphabet_words,alphabet_hashtags,dummy_word_idx,output_dir_hashtags.format(inp))
    inp = 'test'
    store_file(data_dir.format(inp),output_dir_tweets.format(inp),alphabet_words,alphabet_hashtags,dummy_word_idx,output_dir_hashtags.format(inp))

    cPickle.dump(alphabet_words, open(os.path.join(outdir, 'vocab_words.pickle'), 'w'))
    cPickle.dump(alphabet_hashtags, open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'w'))
Example No. 37
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30):
    """
    generate data for characters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :return: C_train, C_dev, C_test, char_embedd_table
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)

                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)

                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
            theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(sentences_test)

    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" % max_char_length_train)
    logger.info("Maximum character length of dev set is %d" % max_char_length_dev)
    logger.info("Maximum character length of test set is %d" % max_char_length_test)
    logger.info("Maximum character length used for training is %d" % max_char_length)

    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table()
Example No. 38
def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=1, label_column=4,
                                   label_name='pos', oov='embedding', fine_tune=False, embedding="word2Vec",
                                   embedding_path=None,
                                   use_character=False):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param label_column: the column of label (start from 0)
    :param label_name: name of label, such as pos or ner
    :param oov: embedding for oov word, choose from ['random', 'embedding']. If "embedding", then add words in dev and
                test data to alphabet; if "random", not.
    :param fine_tune: whether to fine-tune word embeddings.
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :param use_character: whether to use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
            embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table
    """

    def get_max_length(word_sentences):
        max_len = 0
        for sentence in word_sentences:
            length = len(sentence)
            if length > max_len:
                max_len = length
        return max_len

    def construct_tensor_fine_tune(word_index_sentences, label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                label = label_ids[j]
                X[i, j] = wid
                Y[i, j] = label - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def build_embedd_table(embedd_dict, embedd_dim, caseless):
        scale = np.sqrt(3.0 / embedd_dim)
        embedd_table = np.empty([word_alphabet.size(), embedd_dim], dtype=theano.config.floatX)
        embedd_table[word_alphabet.default_index, :] = np.random.uniform(-scale, scale, [1, embedd_dim])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[ww] if ww in embedd_dict else np.random.uniform(-scale, scale, [1, embedd_dim])
            embedd_table[index, :] = embedd
        return embedd_table

    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train,
                                                                  label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev, label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test, label_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                            word_sentences_test,
                                                                            max_length) if use_character else (
            None, None, None, None)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               build_embedd_table(embedd_dict, embedd_dim, caseless), label_alphabet, \
               C_train, C_dev, C_test, char_embedd_table

    def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict,
                                       embedd_dim, caseless):
        X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX)

        # bad_dict = dict()
        # bad_num = 0
        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                label = label_ids[j]
                embedd = embedd_dict[word] if word in embedd_dict else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label - 1

                # if word not in embedd_dict:
                #     bad_num += 1
                #     if word in bad_dict:
                #         bad_dict[word] += 1
                #     else:
                #         bad_dict[word] = 1

            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1

        # for w, c in bad_dict.items():
        #     if c >= 100:
        #         print "%s: %d" % (w, c)
        # print bad_num

        return X, Y, mask

    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                      label_index_sentences_train, unknown_embedd,
                                                                      embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev,
                                                                unknown_embedd, embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test,
                                                                   unknown_embedd, embedd_dict, embedd_dim, caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                            word_sentences_test,
                                                                            max_length) if use_character else (
            None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               None, label_alphabet, C_train, C_dev, C_test, char_embedd_table

    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling(
        train_path, word_alphabet, label_alphabet, word_column, label_column)

    # if oov is "random" and do not fine tune, close word_alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling(
        dev_path, word_alphabet, label_alphabet, word_column, label_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    if fine_tune:
        logger.info("Generating data with fine tuning...")
        return generate_dataset_fine_tune()
    else:
        logger.info("Generating data without fine tuning...")
        return generate_dataset_not_fine_tune()
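A hedged call sketch for the function above; the paths and embedding file are illustrative, the column defaults match the signature (word in column 1, label in column 4 of a CoNLL-style file), and the unpacked return values follow the docstring for the fine_tune=True branch.

# Hypothetical file paths; only the call shape is shown.
(X_train, Y_train, mask_train,
 X_dev, Y_dev, mask_dev,
 X_test, Y_test, mask_test,
 embedd_table, label_alphabet,
 C_train, C_dev, C_test, char_embedd_table) = load_dataset_sequence_labeling(
    "data/train.conll", "data/dev.conll", "data/test.conll",
    label_name="pos", fine_tune=True,
    embedding="word2Vec", embedding_path="embeddings/vectors.bin",
    use_character=True)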
Example No. 39
class BinarySource( Source ):
	""" Source for binary classification data in following format:
	one example per line with feature-value pair separated by
	separator symbol (' ' by default). E.g.:

	1	f1:1.0 f2:1.0 f3:1.0
	-1	f2:1.0 f3:1.0 f8:1.0
	-1	f1:1.0 f2:1.0
	1	f8:1.0 f9:1.0 f10:1.0
	"""
	def __init__( self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True, alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@" ):
		Source.__init__(self, data, encoding=encoding)
		self._Instance = BinaryClassificationInstance
		if feature_alphabet != None:
			self._feature_alphabet = feature_alphabet
		else:
			self._feature_alphabet = Alphabet(locked=False)
		self._sep = sep
		self._bias = bias
		self._bias_prefix = bias_prefix
		if alphabet_pop:
			self._populate_alphabet()
		if alphabet_lock:
			self.lock_alphabet()
		else:
			self.unlock_alphabet()
		return

	def _parse( self ):
		""" return parsed line """
		sep = self._sep
		for line in self._stream:
			line = line.rstrip()
			items = line.split()
			cl = items[0]
			assert cl in [POS_LAB, NEG_LAB]
			feats = []
			if self._bias:
				feats.append( (self._bias_prefix, 1.0) ) # implicit bias
			for s in items[1:]:
				try:
					f,v = s.rsplit(sep, 1)
					v = float(v)
					feats.append( (f,v) )
				except ValueError:
					sys.exit("Datasource error: make sure you use the right datasource format.")
			yield ( cl, feats )

	def _populate_alphabet( self ):
		print >> sys.stderr, "Populating feature alphabet...             ",
		self.unlock_alphabet()
		if self._stream_type == "generator":
			for i, gen_inst in enumerate(self._stream): # read stream directly
				sys.stderr.write("%s" %"\b"*len(str(i))+str(i))	
				featvals = gen_inst.get_featvals()
				for (f,_) in featvals:
					self._feature_alphabet.add(f)
		else:
			try:
				for tag,feats in self._parse():
					for f,_ in feats:
						self._feature_alphabet.add( f )
			except ValueError:
				sys.exit("Datasource error: make sure you use the right data format.")
			# rewind stream
		try:
			self.rewind()
		except TypeError:
			sys.exit("TypeError: make sure rewind() is used only on files.")
		print >> sys.stderr, " done."
		print >> sys.stderr, "Number of features: %s" %self._feature_alphabet.size()
		return

	def unlock_alphabet( self ):
		self._feature_alphabet.unlock()
		return

	def lock_alphabet( self ):
		self._feature_alphabet.lock()
		return

	def set_alphabet( self, feature_alphabet ):
		self._feature_alphabet = feature_alphabet
		return

	def get_alphabet( self ):
		return self._feature_alphabet

	def get_input( self ):
		for label,feats in self._parse():
			yield label, feats

	def __iter__( self ):
		""" instance generator """
		feature_alphabet = self._feature_alphabet
		assert not (feature_alphabet.empty() and feature_alphabet.locked()), "Feature alphabet is empty!"
		if self._stream_type in ["file","list"]:
			for idx,(label,feats) in enumerate(self._parse()):
				if not feature_alphabet.locked(): # dynamic feature alphabet
					for (f,_) in feats:
						feature_alphabet.add(f)
				instance =  self._Instance(idx, label, feats, feature_alphabet)
				yield instance
		elif self._stream_type == "generator":
			for idx, gen_inst in enumerate(self._stream): # read stream directly
				featvals = gen_inst.get_featvals()
				label = gen_inst.get_label()
				if not feature_alphabet.locked(): # dynamic feature alphabet
					for (f,_) in featvals:
						feature_alphabet.add(f)
				instance = self._Instance(idx, label, featvals, feature_alphabet)  # same arguments as the file/list branch
				yield instance

	def size( self ):
		s = len(list(self._stream))
		self.rewind()
		return s
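A hedged usage sketch for BinarySource; the Source base class, BinaryClassificationInstance, and the POS_LAB/NEG_LAB constants are not shown in this snippet, so the file name and iteration below are illustrative only.

# Hypothetical input file in the "<label> f1:1.0 f2:1.0 ..." format from the docstring.
src = BinarySource(open("train.dat"), bias=True)
n_features = src.get_alphabet().size()   # alphabet was populated and locked in __init__
for inst in src:                         # yields BinaryClassificationInstance objects
    pass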
Example No. 40
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground-truth configuration, and the active labels. This
    function maps each feature and each label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)  # add labels to label domain
        # extract features of the target path
        F = x.F
        path = x.truth
        A.add_many(F(0, None, path[0]))
        A.add_many(k for t in xrange(1, x.N) for k in F(t, path[t-1], path[t]))
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
Example No. 41
  punct = set(string.punctuation)
  #stoplist.update(punct)

  # merge inputs to compute word frequencies
  _, ext = os.path.splitext(os.path.basename(train))
  all_fname = "/tmp/trec-merged" + ext
  files = ' '.join([train, dev, test])
  subprocess.call("/bin/cat {} > {}".format(files, all_fname), shell=True)
  unique_questions, qids, questions, answers, labels = load_data(all_fname, resample = False)

  docs = answers + unique_questions
  word2dfs = compute_dfs(docs)
  print word2dfs.items()[:10]

  # map words to ids
  alphabet = Alphabet(start_feature_id=0)
  alphabet.add('UNKNOWN_WORD_IDX')
  add_to_vocab(answers, alphabet)
  add_to_vocab(questions, alphabet)
  basename = os.path.basename(train)
  cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
  print "alphabet size=", len(alphabet)

  # dump embedding file
  dummy_word_idx = alphabet.fid
  dump_embedding(outdir, 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin', alphabet)

  # summarize max sentence length
  q_max_sent_length = max(map(lambda x: len(x), questions))
  a_max_sent_length = max(map(lambda x: len(x), answers))
  print 'q_max_sent_length', q_max_sent_length
Example No. 42
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4, head_column=6, type_column=7,
                         embedding="word2Vec", embedding_path=None):
    """

    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
             X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
             X_test, POS_test, Head_test, Type_test, mask_test,
             embedd_table, word_alphabet, pos_alphabet, type_alphabet, C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences, type_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Head = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Type = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                pid = pos_ids[j]
                head = heads[j]
                tid = type_ids[j]
                X[i, j] = wid
                POS[i, j] = pid - 1
                Head[i, j] = head
                Type[i, j] = tid - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
    word_index_sentences_train, pos_index_sentences_train, \
    type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                    pos_column, head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
    word_index_sentences_dev, pos_index_sentences_dev, \
    type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                  pos_column, head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
    word_index_sentences_test, pos_index_sentences_test, \
    type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                   pos_column, head_column, type_column)

    # close alphabets
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                       logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(word_index_sentences_train,
                                                                              pos_index_sentences_train,
                                                                              head_sentences_train,
                                                                              type_index_sentences_train)

    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(word_index_sentences_dev,
                                                                    pos_index_sentences_dev,
                                                                    head_sentences_dev,
                                                                    type_index_sentences_dev)

    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(word_index_sentences_test,
                                                                         pos_index_sentences_test,
                                                                         head_sentences_test,
                                                                         type_index_sentences_test)

    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless)

    C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                        word_sentences_test, max_length)

    return X_train, POS_train, Head_train, Type_train, mask_train, \
           X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
           X_test, POS_test, Head_test, Type_test, mask_test, \
           embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
           C_train, C_dev, C_test, char_embedd_table