def cell_vector_avg(cell, w2v_model):
    """Average the word2vec vectors of a cell's alphabetic tokens.

    Args:
        cell: Raw cell text; the literal string 'NaN' is treated as empty.
        w2v_model: Trained gensim word2vec model (pre-4.0 API: ``wv.vocab``
            — NOTE(review): gensim >= 4.0 renamed this to ``key_to_index``;
            confirm the pinned gensim version).

    Returns:
        A 1-D numpy array of length ``w2v_model.vector_size``: the mean of
        the in-vocabulary token vectors, or all zeros if none matched.
    """
    vector = np.zeros(w2v_model.vector_size)
    n = 0
    if cell != 'NaN':  # 'NaN' is the dataset's missing-value placeholder
        ent_n = cell_phrase_preprocess(cell)
        tokenized_line = ' '.join(tokenize(ent_n))
        words = [w for w in tokenized_line.lower().split() if w.isalpha()]
        for word in words:
            # Out-of-vocabulary tokens are skipped so they don't dilute the mean.
            if word in w2v_model.wv.vocab:
                vector += w2v_model.wv[word].reshape(w2v_model.vector_size)
                n += 1
    # Guard against division by zero when no token was in vocabulary.
    return vector if n == 0 else vector / n
def __iter__(self):
    """Yield one lower-cased token list per non-empty line of every file
    under ``self.dirname`` (recursive walk).

    Fixes a file-handle leak: the original iterated ``open(path)`` without
    ever closing it; a context manager now closes each file deterministically.
    NOTE(review): encoding is left at the platform default as in the
    original; the sibling corpus iterator uses utf-8 — confirm which is
    intended.
    """
    for root, dirs, files in os.walk(self.dirname):
        for filename in files:
            file_path = os.path.join(root, filename)
            with open(file_path) as fh:
                for line in fh:
                    sline = line.strip()
                    if sline == "":
                        continue  # skip blank lines
                    rline = preprocess.code2string(sline)
                    tokenized_line = ' '.join(tokenize(rline))
                    # No isalpha() filter here, unlike the HTML iterator —
                    # code tokens may legitimately contain digits/symbols.
                    yield tokenized_line.lower().split()
def cell_vector(cell, w2v_model, seq_size):
    """Build a (seq_size, vector_size) matrix of word vectors for one cell.

    Args:
        cell: Raw cell text; the literal string 'NaN' is treated as empty.
        w2v_model: Trained gensim word2vec model (pre-4.0 ``wv.vocab`` API).
        seq_size: Maximum number of token rows; extra tokens are dropped.

    Returns:
        A numpy array of shape (seq_size, vector_size). Rows for missing,
        out-of-vocabulary, or truncated tokens remain zero.
    """
    vectors = np.zeros((seq_size, w2v_model.vector_size))
    if cell != 'NaN':  # 'NaN' is the dataset's missing-value placeholder
        ent_n = cell_phrase_preprocess(cell)
        tokenized_line = ' '.join(tokenize(ent_n))
        words = [w for w in tokenized_line.lower().split() if w.isalpha()]
        # Slice replaces the original index-check-and-break: only the first
        # seq_size tokens fit the matrix.
        for i, word in enumerate(words[:seq_size]):
            if word in w2v_model.wv.vocab:
                vectors[i] = w2v_model.wv[word]
    return vectors
def Synth_Column_Encode_WV(micro_table, seq_size, w2v_model):
    """Encode the 'col_0' column of a micro table as a (seq_size, 1, D) tensor.

    All cells of col_0 are flattened into one alphabetic token sequence; the
    first ``seq_size`` tokens get their word2vec vectors (one per row), and
    rows for out-of-vocabulary or missing tokens remain zero.

    Args:
        micro_table: Mapping/DataFrame exposing an iterable 'col_0' column
            of cell strings — presumably pandas; verify against callers.
        seq_size: Number of sequence positions in the output tensor.
        w2v_model: Trained gensim word2vec model (pre-4.0 ``wv.vocab`` API).

    Returns:
        A numpy array of shape (seq_size, 1, vector_size).
    """
    D = w2v_model.vector_size
    emd = np.zeros((seq_size, 1, D))
    seq = []
    # Flatten every cell of the column into one token sequence.
    for cell in micro_table['col_0']:
        ent_n = cell_phrase_preprocess(cell)
        tokenized_line = ' '.join(tokenize(ent_n))
        seq += [w for w in tokenized_line.lower().split() if w.isalpha()]
    # enumerate over the capped sequence replaces the original
    # for-j-in-range(seq_size) index loop with bounds checks.
    for j, word in enumerate(seq[:seq_size]):
        if word in w2v_model.wv.vocab:
            emd[j, 0, :] = w2v_model.wv[word]
    return emd
def synthetic_columns2sequence(ent_units, sequence_size):
    """Turn entity strings into a fixed-length lower-case word sequence.

    Separator characters are blanked, each entity is tokenized, and the
    alphabetic tokens of all entities are concatenated. The result is
    truncated or right-padded with the placeholder 'NaN' to exactly
    ``sequence_size`` items.

    Args:
        ent_units: Iterable of entity strings.
        sequence_size: Exact length of the returned list.

    Returns:
        A list of ``sequence_size`` lower-case words (padded with 'NaN').
    """
    # One C-level translate pass replaces six chained .replace() calls;
    # the table is built once, outside the per-entity loop.
    sep_table = str.maketrans({c: ' ' for c in '_-./"\''})
    word_seq = []
    for ent in ent_units:
        ent_n = ent.translate(sep_table)
        tokenized_line = ' '.join(tokenize(ent_n))
        word_seq += [w for w in tokenized_line.lower().split() if w.isalpha()]
    if len(word_seq) >= sequence_size:
        return word_seq[:sequence_size]
    return word_seq + ['NaN'] * (sequence_size - len(word_seq))
def __iter__(self):
    """Yield one list of alphabetic lower-cased tokens per non-empty line of
    every file under ``self.dirname`` (recursive walk, utf-8).

    Fixes a file-handle leak: the original iterated ``open(path, ...)``
    without ever closing it; a context manager now closes each file
    deterministically.
    """
    for root, dirs, files in os.walk(self.dirname):
        for filename in files:
            file_path = os.path.join(root, filename)
            with open(file_path, encoding='utf-8') as fh:
                for line in fh:
                    sline = line.strip()
                    if sline == "":
                        continue  # skip blank lines
                    rline = cleanhtml(sline)
                    tokenized_line = ' '.join(tokenize(rline))
                    yield [w for w in tokenized_line.lower().split() if w.isalpha()]
def to_lower_words(s):
    """Split a string into its lower-case alphabetic words.

    Nine separator/punctuation characters are blanked in a single pass, the
    text is tokenized, and only purely alphabetic tokens are kept.

    Args:
        s: Input string.

    Returns:
        A list of lower-case alphabetic words.
    """
    # One C-level translate pass replaces nine chained .replace() calls.
    sep_table = str.maketrans({c: ' ' for c in '_-./"\'\\()'})
    tokenized_line = ' '.join(tokenize(s.translate(sep_table)))
    return [word for word in tokenized_line.lower().split() if word.isalpha()]