Example #1
import numpy as np

def word_to_extra(words_data, w, w_range, h_size):
    """Collect per-sentence training inputs (word vectors, hidden-node
    indices, dependency order) plus the set of relation weights used."""
    wd_extra = []
    wpresent = []
    for iword in range(len(words_data)):

        # skip sentinel (-2) or empty entries
        if words_data[iword] == -2 or not words_data[iword]:
            continue

        h_index, h_vect, wp, Word_ids, w_size, dep_order, hh_index, p = wd_preprocess(
            words_data[iword])
        if h_index == -1:
            continue

        cflag = 0
        for i in wp:
            # skip the sentence if any composition step has more children
            # than half the allowed window
            if len(i) > abs(w_range / 2):
                cflag = 1
                break
            for j in i:
                wpresent.append(j)
                # lazily initialise a relation's weights and Adam buffers
                # the first time it is seen
                if not isinstance(w[j], np.ndarray):
                    w[j] = init_weight(h_size, h_size)
                    Global.m[j] = zero_weight(h_size, h_size)
                    Global.v[j] = zero_weight(h_size, h_size)
                    Global.lr[j] = neta

        if cflag == 1:
            continue
        wpresent = list(set(wpresent))  # deduplicate relation keys
        Word_vects = []
        # entries missing a 'word' field raise KeyError; skip such sentences
        try:
            for i in sorted(words_data[iword]):
                Word_vects.append(
                    get_word_vect(words_data[iword][i]['word'].lower(),
                                  Global.v_size))
        except KeyError:
            continue

        wd_extra.append({
            'w_size': w_size,
            'h_index': h_index,
            'h_vect': h_vect,
            'Word_vects': Word_vects,
            'wp': wp,
            "dep_order": dep_order,
            "hh_index": hh_index,
            'Word_ids': Word_ids,
            'p': p
        })
    return wd_extra, wpresent
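
Example #1 initialises a relation's weight matrix, its Adam moment buffers, and its per-relation learning rate the first time that relation appears. Below is a minimal, self-contained sketch of that lazy-initialisation pattern; init_weight, zero_weight, the relation labels, and the neta value here are stand-ins, not the project's real definitions.

import numpy as np

neta = 0.001  # base learning rate (assumed)

def init_weight(rows, cols):
    # small uniform initialisation; a common default, assumed here
    return np.random.uniform(-0.1, 0.1, (rows, cols))

def zero_weight(rows, cols):
    return np.zeros((rows, cols))

h_size = 4
w = {'nsubj': None, 'dobj': None}  # hypothetical relation labels
m, v, lr = {}, {}, {}              # Adam buffers and per-relation rates
for j in w:
    if not isinstance(w[j], np.ndarray):  # first time this relation is seen
        w[j] = init_weight(h_size, h_size)
        m[j] = zero_weight(h_size, h_size)  # Adam first moment
        v[j] = zero_weight(h_size, h_size)  # Adam second moment
        lr[j] = neta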
Example #2
import pickle

def get_vect_by_wd_dep(flag, words_data, mtype='normal'):
    """RAE-encode a dependency-parsed sentence and return each chunk's
    text alongside its composed vector."""
    if flag == 't':
        Word_ids = get_words_id(words_data)
        Word_vects = []
        for i in sorted(Word_ids):
            Word_vects.append(
                preprocess(
                    get_word_vect(words_data[i]['word'].lower(),
                                  Global.v_size)))
        w_size = len(Word_ids)
        p = get_parents(words_data)
        d = get_dep(words_data)
        dep_order, d1 = pdep_2_deporder_dep(p, d)
        h_index, h_vect, wp, _ = dep_2_hid_var(p, dep_order, d1, Word_ids)
        # Word_vects = get_words_vect(words_data, Word_ids, Global.v_size)
        vect = Word_vects + [None] * len(h_vect)  # slots for hidden nodes
        del Word_vects
        # load the trained composition weights
        with open(Global.wfname, 'rb') as f:
            w = pickle.load(f)
        if mtype == 'normal':
            RAE_adam_herical.rae_encoding(vect=vect,
                                          w=w,
                                          w_size=w_size,
                                          h_vect=h_vect,
                                          wp=wp)
        elif mtype == 'deep':
            RAE_adam_herical_deep1.rae_encoding(vect=vect,
                                                w=w,
                                                w_size=w_size,
                                                h_vect=h_vect,
                                                wp=wp)
        chunks = {}
        chunks_vect = {}
        for i in range(len(Word_ids)):
            chunks[i] = words_data[Word_ids[i]]['word']
            chunks_vect[i] = vect[i]
        rev_h_index = {v: k for k, v in h_index.items()}
        count = len(Word_ids)  # next chunk id after the single-word entries
        for i in h_vect:
            if len(i) > 1:
                # multi-word chunk: use the composed vector of its subtree head
                chunks_vect[count] = vect[h_index[Word_ids[min(i)]]]
                chunks[count] = ' '.join([
                    words_data[rev_h_index[j]]['word']
                    if j >= len(Word_ids) else words_data[Word_ids[j]]['word']
                    for j in i
                ])
                count += 1
        return chunks, chunks_vect
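
The chunk-text assembly at the end of Example #2 leans on inverting h_index so a hidden slot can be mapped back to a word id: slots below len(Word_ids) are leaf words, anything above is a composed node. A toy illustration of that lookup, with invented ids:

Word_ids = [10, 11, 12]          # hypothetical token ids
h_index = {10: 3, 11: 4, 12: 5}  # word id -> hidden slot
rev_h_index = {v: k for k, v in h_index.items()}
for j in [0, 4]:
    wid = Word_ids[j] if j < len(Word_ids) else rev_h_index[j]
    print(wid)                   # 10, then 11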
Example #3
import pickle

def get_chk_vect_by_wd(flag, words_data, mtype='normal'):
    """Like get_vect_by_wd_dep, but takes chunk boundaries from get_chunks
    and pairs each chunk with the vector at its head's slot."""
    if flag == 't':
        Word_ids = get_words_id(words_data)
        Word_vects = []
        for i in sorted(Word_ids):
            Word_vects.append(
                preprocess(
                    get_word_vect(words_data[i]['word'].lower(),
                                  Global.v_size)))
        w_size = len(Word_ids)
        p = get_parents(words_data)
        d = get_dep(words_data)
        dep_order, d1 = pdep_2_deporder_dep(p, d)
        h_index, h_vect, wp, _ = dep_2_hid_var(p, dep_order, d1, Word_ids)
        # Word_vects = get_words_vect(words_data, Word_ids, Global.v_size)
        vect = Word_vects + [None] * len(h_vect)  # slots for hidden nodes
        del Word_vects
        # load the trained composition weights
        with open(Global.wfname, 'rb') as f:
            w = pickle.load(f)
        if mtype == 'normal':
            RAE_adam_herical.rae_encoding(vect=vect,
                                          w=w,
                                          w_size=w_size,
                                          h_vect=h_vect,
                                          wp=wp)
        elif mtype == 'deep':
            RAE_adam_herical_deep1.rae_encoding(vect=vect,
                                                w=w,
                                                w_size=w_size,
                                                h_vect=h_vect,
                                                wp=wp)
        chks = get_chunks(words_data)
        chks_main = get_chunk_main(chks, dep_order)
        chunks = {}
        chunks_vect = {}
        for c in range(len(chks)):
            chunks[c] = ' '.join([words_data[i]['word'] for i in chks[c]])
        del words_data
        # each chunk's vector is the encoder output at its head's slot
        for c in range(len(chks_main)):
            chunks_vect[c] = vect[h_index[chks_main[c]]]
        return chunks, chunks_vect
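
Example #3 differs from #2 in taking chunk boundaries from get_chunks and then, via get_chunk_main, pairing each chunk with the word id of its dependency head; the chunk vector is whatever the encoder left at that head's slot. A self-contained mock of that lookup, with invented ids and vectors:

import numpy as np

vect = [np.full((2, 1), float(i)) for i in range(6)]  # mock encoder output
chks = {0: [10, 11], 1: [12]}     # chunk -> word ids (hypothetical)
chks_main = {0: 11, 1: 12}        # chunk -> head word id (hypothetical)
h_index = {10: 3, 11: 4, 12: 5}   # head word id -> slot in vect

chunks_vect = {c: vect[h_index[chks_main[c]]] for c in chks_main}
print(chunks_vect[0].ravel())     # the head's vector: [4. 4.]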
Example #4
import pickle

def get_chk_vect(flag, line, mtype='normal'):
    """Parse a raw line with SENNA, RAE-encode it, and return its chunks
    and chunk vectors in dependency order."""
    if flag == 't':
        line = line_processing(line)
        # print(line)  # debug
        words_data = extract_feature_using_senna(line)
        p = get_parents(words_data)
        d = get_dep(words_data)
        Word_ids = get_words_id(words_data)
        for i in Word_ids:
            words_data[i]['vect'] = preprocess(
                get_word_vect(words_data[i]['word'].lower(), Global.v_size))
        w_size = len(Word_ids)
        dep_order, d1 = pdep_2_deporder_dep(p, d)
        h_index, h_vect, wp, _ = dep_2_hid_var(p, dep_order, d1, Word_ids)
        Word_vects = get_words_vect(words_data, Word_ids, Global.v_size)
        vect = Word_vects + [None] * len(h_vect)  # slots for hidden nodes
        del Word_vects
        # load the trained composition weights
        with open(Global.wfname, 'rb') as f:
            w = pickle.load(f)
        if mtype == 'normal':
            RAE_adam_herical.rae_encoding(vect=vect,
                                          w=w,
                                          w_size=w_size,
                                          h_vect=h_vect,
                                          wp=wp)
        elif mtype == 'deep':
            RAE_adam_herical_deep1.rae_encoding(vect=vect,
                                                w=w,
                                                w_size=w_size,
                                                h_vect=h_vect,
                                                wp=wp)
        chks = get_chunks_by_dep(Word_ids, h_index, h_vect)
        chunks = {}
        chunks_vect = {}
        count = 0
        order = get_order(d1, w_size)  # traversal order from the dependencies
        for m in order:
            chunks[count] = ' '.join([words_data[i]['word'] for i in chks[m]])
            chunks_vect[count] = vect[h_index[m]]
            count += 1
        return chunks, chunks_vect
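
All four functions pre-allocate vect as the leaf word vectors followed by one None slot per hidden node, which rae_encoding then fills bottom-up in the order given by h_vect. A toy version of that fill, with a single shared tanh composition standing in for the real RAE_adam_herical encoder (shapes, weights, and merge order invented):

import numpy as np

v_size = 4
Word_vects = [np.random.randn(v_size, 1) for _ in range(3)]  # 3 leaf words
h_vect = [[0, 1], [2, 3]]                  # hypothetical merge order
vect = Word_vects + [None] * len(h_vect)   # None slots for hidden nodes

W = np.random.uniform(-0.1, 0.1, (v_size, 2 * v_size))
for slot, children in enumerate(h_vect, start=len(Word_vects)):
    pair = np.vstack([vect[c] for c in children])  # stack the two children
    vect[slot] = np.tanh(W @ pair)                 # fill the slot in place
print(vect[-1].shape)                              # (4, 1)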
Example #5
        cflag = 0
        # for i in wp:
        #     if len(i) > abs(w_range / 2):
        #         cflag = 1
        #         break
        #     for j in i:
        #         if type(w[j]) != np.ndarray:
        #             w[j] = init_weight(h_size)
        # if cflag == 1:
        #     continue

        Word_vects = []
        for i in sorted(words_data[iword]):
            Word_vects.append(
                get_word_vect(words_data[iword][i]['word'].lower(),
                              Global.v_size))

        wd_extra.append({
            'w_size': w_size,
            'h_index': h_index,
            'h_vect': h_vect,
            'Word_vects': Word_vects,
            'wp': wp,
            "dep_order": dep_order,
            "hh_index": hh_index,
            'Word_ids': Word_ids,
            'p': p
        })
    # copy every initialised weight matrix shifted by e
    # (w1 and e are assumed to come from the enclosing scope of this fragment)
    for i in w:
        if isinstance(w[i], np.ndarray):
            w1[i] = w[i].copy() + e
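
The closing loop of Example #5 copies every initialised weight matrix shifted by e, which reads like the perturbation step of a finite-difference gradient check. A minimal sketch of that reading, under the assumption that e is a small step and a scalar loss is evaluated on both copies; note that perturbing a whole matrix at once yields the directional derivative along an all-ones direction, not per-entry gradients:

import numpy as np

def loss(weights):
    # toy scalar loss over all weight matrices (assumption)
    return sum(float(np.sum(m ** 2)) for m in weights.values())

e = 1e-6
w = {'nsubj': np.random.randn(2, 2), 'dobj': None}  # None = uninitialised
w1 = {i: w[i].copy() + e for i in w if isinstance(w[i], np.ndarray)}

numeric = (loss(w1) - loss({k: w[k] for k in w1})) / e
analytic = float(np.sum(2 * w['nsubj']))  # d/dw of sum(w**2), summed
print(abs(numeric - analytic) < 1e-3)     # True up to O(e) error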