def build_dataset(self, pos, neg):
    # Check that both files exist
    if not (utils.check_file(pos) and utils.check_file(neg)):
        raise FileNotFoundError("Files %s and %s do not exist" % (pos, neg))
    with open(pos, "r") as f:
        pos = f.readlines()
    with open(neg, "r") as f:
        neg = f.readlines()
    # TODO: Deprecate full clean
    if self.full_clean:
        pos = utils.full_clean(dataset=pos)
        neg = utils.full_clean(dataset=neg)
    # Create one-hot labels: [0, 1] = positive, [1, 0] = negative
    pos_labels = [[0, 1] for _ in pos]
    neg_labels = [[1, 0] for _ in neg]
    # Combine sets
    text = pos + neg
    labels = pos_labels + neg_labels
    # Replace each line with a [token_ids, label] pair
    for i, line in enumerate(text):
        line = line.strip().split(" ")
        text[i] = [[self.word_id(w) for w in line], labels[i]]
    return np.array(text)
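# Hedged usage sketch (not in the original): `loader` is a hypothetical
# instance of the class this method belongs to, and the paths follow the
# naming used in word2vec() below. Each entry of the returned array is a
# [token_ids, one_hot_label] pair.
data = loader.build_dataset("./data/sets/a_pos.txt", "./data/sets/a_neg.txt")
tokens, label = data[0]  # e.g. [3, 17, 5], [0, 1] for a positive line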
def word2vec(args):
    # Files
    data_dir = "./data/sets/"
    vector_dir = "./data/vectors/"
    file_names = ["a", "b", "c", "d1", "d2", "d3"]
    vectors = dict()
    all_words = []
    # Word2Vec model
    t0 = time()
    w2v = gensim.models.KeyedVectors.load_word2vec_format(args.dir, binary=True)
    print("Took", time() - t0)
    for file in file_names:
        with open(data_dir + file + "_pos.txt", "r") as f:
            pos = f.readlines()
        with open(data_dir + file + "_neg.txt", "r") as f:
            neg = f.readlines()
        dset = pos + neg
        # Keep both the raw and the cleaned lines
        dset += utils.full_clean(dset)
        for line in dset:
            for word in line.strip().split(" "):
                # KeyedVectors supports the `in` membership check directly
                if word not in vectors and word in w2v:
                    vectors[word] = w2v[word]
                if word not in all_words:
                    all_words.append(word)
    print("Trimming finished, overall there are", len(vectors),
          "vectors out of a total of", len(all_words), "words.")
    np.savez(vector_dir + args.save, embeddings=vectors)
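# Hedged sketch (not in the original, path invented): reading the saved
# vectors back. np.savez stores the dict as a 0-d object array, so .item()
# recovers it; this mirrors the loading logic in build_vocabs() below.
loaded = np.load("./data/vectors/my_vectors.npz", allow_pickle=True)
vectors = loaded["embeddings"].item()  # back to a {word: vector} dict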
def fill_dict():
    # Relies on file_names, data_dir, vectors, glove and all_words from the
    # enclosing scope (see glove() below)
    for file in file_names:
        with open(data_dir + file + "_pos.txt", "r") as f:
            pos = f.readlines()
        with open(data_dir + file + "_neg.txt", "r") as f:
            neg = f.readlines()
        dset = pos + neg
        # Keep both the raw and the cleaned lines
        dset += utils.full_clean(dset)
        for line in dset:
            for word in line.strip().split(" "):
                if word not in vectors and word in glove:
                    vectors[word] = glove[word]
                if word not in all_words:
                    all_words.append(word)
def glove(args):
    # Files
    data_dir = "./data/sets/"
    vector_dir = "./data/vectors/"
    file_names = ["a", "b", "c", "d1", "d2", "d3"]
    vectors = dict()
    all_words = []
    # Build a glove dict
    with open(args.dir, "r") as f:
        glove_file = f.readlines()
    glove = dict()
    # A glove.txt line has the format: word float_0 ... float_dim
    for i, line in enumerate(glove_file):
        line = line.strip().split(" ")
        # line[0] holds the word, the rest are the vector components
        word = line[0]
        glove[word] = [np.float32(x) for x in line[1:]]
        if i % 100000 == 0:
            print(i)
    print(len(glove))
    for file in file_names:
        # Process both polarities, as in word2vec() above
        with open(data_dir + file + "_pos.txt", "r") as f:
            pos = f.readlines()
        with open(data_dir + file + "_neg.txt", "r") as f:
            neg = f.readlines()
        dset = pos + neg
        # Keep both the raw and the cleaned lines
        dset += utils.full_clean(dset)
        for line in dset:
            for word in line.strip().split(" "):
                if word not in vectors and word in glove:
                    vectors[word] = glove[word]
                if word not in all_words:
                    all_words.append(word)
    print("Trimming finished, overall there are", len(vectors),
          "vectors out of a total of", len(all_words), "words.")
    # Save the trimmed vectors (assumed to mirror word2vec(); vector_dir is
    # otherwise unused)
    np.savez(vector_dir + args.save, embeddings=vectors)
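# Hedged illustration (not in the original, values invented): one glove.txt
# line and how the loop above parses it.
example = "the 0.07 -0.15 0.32".strip().split(" ")
word, vec = example[0], [np.float32(x) for x in example[1:]]
# word == "the", vec == [0.07, -0.15, 0.32]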
def build_vocabs(self):
    # Build the word vocab
    self.words = dict()
    # Assign the Unknown token
    self.words["<UNK>"] = 0
    # Build the character vocab
    if self.use_chars:
        self.chars = dict()
        # Assign the Unknown token
        self.chars["<UNK>"] = 0
    else:
        # Need to initialize the character vocab either way
        self.chars = None
    # If we have pretrained vectors, build them here
    if self.pretrained is not None:
        # Load the model
        print("USING: Pre-trained embedding vectors from %s" % self.pretrained)
        vectors = np.load(self.pretrained, allow_pickle=True)
        # np.savez stored the dict as a 0-d object array; .item() recovers it
        vectors = vectors["embeddings"].item()
        # To prevent mistakes regarding embedding dimensions, take the
        # "first" value in the dict and check the embedding size
        self.dim_word = len(list(vectors.values())[0])
        # Embedding matrix
        self.wordvec_matrix = []
        # <UNK> vector
        self.wordvec_matrix.append(np.random.uniform(low=-0.25, high=0.25,
                                                     size=self.dim_word))
    else:
        print("USING: Randomly initialized vectors")
    # Load and join positive and negative files
    with open(self.train["pos"], "r") as f:
        pos = f.readlines()
    with open(self.train["neg"], "r") as f:
        neg = f.readlines()
    dset = pos + neg
    # If using complete clean
    if self.full_clean:
        print("USING: Full clean")
        dset = utils.full_clean(dataset=dset)
    # If using dynamic padding
    if self.dynamic_pad:
        self.maxlen_sentence = 0
        print("USING: Dynamic padding")
    else:
        # Longest sentence measured in tokens, not characters
        self.maxlen_sentence = max(len(line.strip().split(" ")) for line in dset)
        print("USING: Padding is set to a maximum of", self.maxlen_sentence)
    # Build the vocabs and the embedding matrix
    # word and character counters
    nw, nc = 1, 1
    for line in dset:
        line = line.strip().split(" ")
        for w in line:
            if w not in self.words:
                self.words[w] = nw
                # If using pre-trained vectors
                if self.pretrained is not None:
                    # Use the pretrained vector if the word has one,
                    # otherwise initialize it randomly
                    if w in vectors:
                        self.wordvec_matrix.append(vectors[w])
                    else:
                        self.wordvec_matrix.append(np.random.uniform(
                            low=-0.25, high=0.25, size=self.dim_word))
                nw += 1
            if self.use_chars:
                for c in w:
                    if c not in self.chars:
                        self.chars[c] = nc
                        nc += 1
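# Hedged sketch (not part of the original class): after build_vocabs() the
# row list can be stacked into a single ndarray, e.g. to initialize an
# embedding layer. `loader` is a hypothetical instance; wordvec_matrix and
# dim_word only exist when pretrained vectors were used.
loader.build_vocabs()
embedding_matrix = np.array(loader.wordvec_matrix, dtype=np.float32)
# One row per vocab entry, including row 0 for <UNK>
assert embedding_matrix.shape == (len(loader.words), loader.dim_word)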
coin_table = table_container.find("div", class_="coingecko-table")
# Move through the nested divs down to the actual table
coin_table = coin_table.div.div.table

# Table headers
table_head = coin_table.thead
table_head_row = table_head.tr
table_headers = table_head_row.find_all("th")
for header in table_headers:
    header_text = header.text
    if header_text != "":  # skip the blank header
        # Remove newlines from the string
        header_text = full_clean(header_text)
        headers.append(header_text)
# print(headers)

with open(file_name, "w") as crypto_file:
    csv_writer = csv.DictWriter(crypto_file, fieldnames=headers, delimiter="\t")
    # Write the header row
    csv_writer.writeheader()

    # Table content
    table_body = coin_table.tbody
    table_body_rows = table_body.find_all("tr")
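    # Hedged sketch (not in the original snippet): write each body row,
    # assuming the <td> cells line up one-to-one with the collected headers.
    for row in table_body_rows:
        cells = [full_clean(cell.text) for cell in row.find_all("td")]
        csv_writer.writerow(dict(zip(headers, cells)))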