Example #1
import numpy as np

def cell_vector_avg(cell, w2v_model):
    """Encode a table cell as the average of its in-vocabulary word vectors."""
    vector, n = np.zeros(w2v_model.vector_size), 0
    if cell != 'NaN':
        ent_n = cell_phrase_preprocess(cell)
        tokenized_line = ' '.join(tokenize(ent_n))
        is_alpha_word_line = [word for word in tokenized_line.lower().split() if word.isalpha()]
        for word in is_alpha_word_line:
            if word in w2v_model.wv.vocab:  # gensim < 4.0 vocabulary lookup
                vector += w2v_model.wv[word]
                n += 1
    return vector if n == 0 else vector / n
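A minimal usage sketch for the averaging encoder above, run together with the function itself. It assumes gensim 3.x (where the vocabulary is exposed as w2v_model.wv.vocab) and stubs in hypothetical cell_phrase_preprocess and tokenize helpers, since the real implementations are not shown in the snippet:

import numpy as np
from gensim.models import Word2Vec

def cell_phrase_preprocess(cell):   # stub: the real helper is not shown above
    return str(cell).replace('_', ' ')

def tokenize(text):                 # stub: the real tokenizer is not shown above
    return text.split()

# Train a toy model so the vocabulary lookup has something to hit.
sentences = [['new', 'york', 'city'], ['london', 'city']]
model = Word2Vec(sentences, size=50, min_count=1)  # gensim 3.x uses `size`

vec = cell_vector_avg('New_York city', model)
print(vec.shape)  # (50,): the mean of the three word vectors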
Example #2
def __iter__(self):
    for root, _dirs, files in os.walk(self.dirname):
        for filename in files:
            file_path = os.path.join(root, filename)
            with open(file_path) as f:  # close the handle when each file is done
                for line in f:
                    sline = line.strip()
                    if sline == "":
                        continue
                    rline = preprocess.code2string(sline)
                    tokenized_line = ' '.join(tokenize(rline))
                    # Source-code tokens are kept even if non-alphabetic.
                    yield tokenized_line.lower().split()
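This __iter__ is the usual gensim streaming-corpus pattern: the model can re-iterate the directory once per training epoch without loading everything into memory. A self-contained sketch of what the enclosing class presumably looks like, with a hypothetical class name and simplified tokenization in place of preprocess.code2string:

import os

class CodeSentences:
    """Streams one tokenized line at a time so the whole corpus
    never has to fit in memory."""
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for root, _dirs, files in os.walk(self.dirname):
            for filename in files:
                with open(os.path.join(root, filename)) as f:
                    for line in f:
                        sline = line.strip()
                        if sline:
                            yield sline.lower().split()  # simplified tokenization

# Hypothetical usage (gensim 3.x); the model re-iterates the stream per epoch:
# from gensim.models import Word2Vec
# model = Word2Vec(CodeSentences('/path/to/corpus'), size=100, min_count=5)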
Example #3
import numpy as np

def cell_vector(cell, w2v_model, seq_size):
    """Encode a cell as a fixed-length sequence of word vectors, zero-padded."""
    vectors = np.zeros((seq_size, w2v_model.vector_size))
    if cell != 'NaN':
        ent_n = cell_phrase_preprocess(cell)
        tokenized_line = ' '.join(tokenize(ent_n))
        is_alpha_word_line = [word for word in tokenized_line.lower().split() if word.isalpha()]
        for i, word in enumerate(is_alpha_word_line):
            if i >= seq_size:  # truncate sequences longer than seq_size
                break
            if word in w2v_model.wv.vocab:  # gensim < 4.0 vocabulary lookup
                vectors[i] = w2v_model.wv[word]
    return vectors
Example #4
import numpy as np

def Synth_Column_Encode_WV(micro_table, seq_size, w2v_model):
    """Encode the first column of a micro-table as a (seq_size, 1, D) tensor."""
    D = w2v_model.vector_size
    emd = np.zeros((seq_size, 1, D))
    col_0 = micro_table['col_0']
    seq = list()
    for cell in col_0:
        ent_n = cell_phrase_preprocess(cell)
        tokenized_line = ' '.join(tokenize(ent_n))
        seq += [word for word in tokenized_line.lower().split() if word.isalpha()]
    for j in range(min(seq_size, len(seq))):
        if seq[j] in w2v_model.wv.vocab:  # gensim < 4.0 vocabulary lookup
            emd[j, 0, :] = w2v_model.wv[seq[j]]
    return emd
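A quick shape check for the encoder above, run together with the function itself. All helper names are hypothetical stubs and the model follows the gensim 3.x API; the singleton middle axis preserves a per-column dimension in the output:

import numpy as np
from gensim.models import Word2Vec

def cell_phrase_preprocess(cell):   # stub: the real helper is not shown above
    return str(cell).replace('_', ' ')

def tokenize(text):                 # stub: the real tokenizer is not shown above
    return text.split()

micro_table = {'col_0': ['New_York', 'London']}
model = Word2Vec([['new', 'york'], ['london']], size=50, min_count=1)

emd = Synth_Column_Encode_WV(micro_table, seq_size=10, w2v_model=model)
print(emd.shape)  # (10, 1, 50): three rows filled, the rest left as zeros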
Example #5
def synthetic_columns2sequence(ent_units, sequence_size):
    """Concatenate tokens from all entity cells, then truncate or pad
    with 'NaN' so the result has exactly sequence_size tokens."""
    word_seq = list()
    for ent in ent_units:
        # Normalize separator characters to spaces before tokenizing.
        ent_n = ent.replace('_', ' ').replace('-', ' ').replace('.', ' ').replace('/', ' '). \
            replace('"', ' ').replace("'", ' ')
        tokenized_line = ' '.join(tokenize(ent_n))
        word_seq += [word for word in tokenized_line.lower().split() if word.isalpha()]
    if len(word_seq) >= sequence_size:
        return word_seq[:sequence_size]
    return word_seq + ['NaN'] * (sequence_size - len(word_seq))
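A short illustration of the pad-or-truncate contract, assuming a hypothetical tokenize that just splits on whitespace:

def tokenize(text):  # hypothetical stand-in for the project's tokenizer
    return text.split()

print(synthetic_columns2sequence(['New_York', 'Los-Angeles'], 5))
# ['new', 'york', 'los', 'angeles', 'NaN']  <- padded with 'NaN' to length 5
print(synthetic_columns2sequence(['New_York', 'Los-Angeles'], 3))
# ['new', 'york', 'los']                    <- truncated to length 3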
Example #6
def __iter__(self):
    for root, _dirs, files in os.walk(self.dirname):
        for filename in files:
            file_path = os.path.join(root, filename)
            with open(file_path, encoding='utf-8') as f:  # close handle per file
                for line in f:
                    sline = line.strip()
                    if sline == "":
                        continue
                    rline = cleanhtml(sline)  # strip HTML markup before tokenizing
                    tokenized_line = ' '.join(tokenize(rline))
                    yield [word for word in tokenized_line.lower().split()
                           if word.isalpha()]
Example #7
def to_lower_words(s):
    """Split a raw string into lowercase alphabetic tokens,
    treating common separator characters as spaces."""
    s = s.replace('_', ' ').replace('-', ' ').replace('.', ' ').replace('/', ' '). \
        replace('"', ' ').replace("'", ' ').replace('\\', ' ').replace('(', ' ').replace(')', ' ')
    tokenized_line = ' '.join(tokenize(s))
    return [word for word in tokenized_line.lower().split() if word.isalpha()]
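A usage example for the normalizer above, again assuming a hypothetical whitespace tokenize in place of the project's real one:

def tokenize(text):  # hypothetical stand-in; the real tokenizer is not shown
    return text.split()

print(to_lower_words("born_in/New-York (2020)"))
# ['born', 'in', 'new', 'york']  <- '2020' is dropped by the isalpha() filter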