Example #1
    def build_dataset(self, pos, neg):
        # Check that both input files exist
        if not (utils.check_file(pos) and utils.check_file(neg)):
            raise FileNotFoundError("Files %s and %s do not exist" % (pos, neg))

        # Read both sets, closing the files afterwards
        with open(pos, "r") as f:
            pos = f.readlines()
        with open(neg, "r") as f:
            neg = f.readlines()
        # TODO: Deprecate full clean
        if self.full_clean:
            pos = utils.full_clean(dataset=pos)
            neg = utils.full_clean(dataset=neg)
        # Create one-hot labels: [0, 1] is positive, [1, 0] is negative
        pos_labels = [[0, 1] for _ in pos]
        neg_labels = [[1, 0] for _ in neg]
        # Combine sets
        text = pos + neg
        labels = pos_labels + neg_labels
        # Replace every line with a [word ids, label] pair
        for i, line in enumerate(text):
            ids = [self.word_id(w) for w in line.strip().split(" ")]
            text[i] = [ids, labels[i]]

        # The id lists are ragged, so build an object array
        return np.array(text, dtype=object)
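The returned array keeps each sample as a [word_ids, label] pair, so the ids and the one-hot label stay together when the set is shuffled. A minimal usage sketch, assuming a dataset instance and two hypothetical file paths:

import numpy as np

data = dataset.build_dataset("pos.txt", "neg.txt")  # hypothetical paths
np.random.shuffle(data)  # shuffles pairs; ids and labels stay aligned
ids, label = data[0]     # word-id list and its one-hot label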
Example #2
from time import time

import gensim
import numpy as np

import utils


def word2vec(args):
    # Files
    data_dir = "./data/sets/"
    vector_dir = "./data/vectors/"
    file_names = ["a", "b", "c", "d1", "d2", "d3"]
    vectors = dict()
    all_words = set()  # "all" shadows the built-in, and a set makes lookups O(1)

    # Load the pre-trained Word2Vec vectors
    t0 = time()
    w2v = gensim.models.KeyedVectors.load_word2vec_format(args.dir, binary=True)
    print("Took ", time() - t0)

    for file in file_names:
        with open(data_dir + file + "_pos.txt", "r") as f:
            pos = f.readlines()
        with open(data_dir + file + "_neg.txt", "r") as f:
            neg = f.readlines()
        dset = pos + neg
        # Keep both the raw and the cleaned lines
        clean_dset = utils.full_clean(dset)
        dset += clean_dset
        del clean_dset
        for line in dset:
            line = line.strip().split(" ")
            for word in line:
                # "word in w2v" works across gensim versions,
                # unlike the removed w2v.wv.vocab attribute
                if word not in vectors and word in w2v:
                    vectors[word] = w2v[word]
                all_words.add(word)

    print("Trimming finished, overall there are ", len(vectors),
          " vectors out of a total of ", len(all_words), " words.")

    np.savez(vector_dir + args.save, embeddings=vectors)
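Since np.savez pickles the dict into a 0-d object array, reading it back needs allow_pickle=True plus an .item() call, which is exactly what Example #5 does. A minimal sketch (the trimmed.npz path is hypothetical):

import numpy as np

data = np.load("./data/vectors/trimmed.npz", allow_pickle=True)  # hypothetical path
embeddings = data["embeddings"].item()  # unwrap back into a {word: vector} dict
print(len(embeddings))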
Example #3
def fill_dict():
    # Relies on data_dir, file_names, vectors, glove and all_words
    # being defined in the enclosing scope
    for file in file_names:
        with open(data_dir + file + "_pos.txt", "r") as f:
            pos = f.readlines()
        with open(data_dir + file + "_neg.txt", "r") as f:
            neg = f.readlines()
        dset = pos + neg
        # Keep both the raw and the cleaned lines
        clean_dset = utils.full_clean(dset)
        dset += clean_dset
        del clean_dset
        for line in dset:
            line = line.strip().split(" ")
            for word in line:
                if word not in vectors and word in glove:
                    vectors[word] = glove[word]
                all_words.add(word)
Example #4
import numpy as np

import utils


def glove(args):
    # Files
    data_dir = "./data/sets/"
    vector_dir = "./data/vectors/"
    file_names = ["a", "b", "c", "d1", "d2", "d3"]
    vectors = dict()
    all_words = set()

    # Build a GloVe dict; each line of a glove.txt file has the
    # format "word float[0] ... float[dim]"
    with open(args.dir, "r") as f:
        glove_file = f.readlines()
    glove = dict()
    for i, line in enumerate(glove_file):
        line = line.strip().split(" ")
        # line[0] is the word, the rest are the vector components
        word = line[0]
        vector = [np.float32(x) for x in line[1:]]
        glove[word] = vector
        # Progress indicator for large vector files
        if i % 100000 == 0:
            print(i)

    print(len(glove))

    for file in file_names:
        with open(data_dir + file + "_pos.txt", "r") as f:
            pos = f.readlines()
        #neg = list(open(data_dir+file+"_neg.txt", "r").readlines())
        #dset = pos + neg
        dset = pos
        # Keep both the raw and the cleaned lines
        clean_dset = utils.full_clean(dset)
        dset += clean_dset
        del clean_dset
        for line in dset:
            line = line.strip().split(" ")
            for word in line:
                if word not in vectors and word in glove:
                    vectors[word] = glove[word]
                all_words.add(word)

    print("Trimming finished, overall there are ", len(vectors),
          " vectors out of a total of ", len(all_words), " words.")
Example #5
    def build_vocabs(self):
        # Build the word vocab
        self.words = dict()

        # Assign the unknown token
        self.words["<UNK>"] = 0

        # Build the character vocab
        if self.use_chars:
            self.chars = dict()
            # Assign the unknown token
            self.chars["<UNK>"] = 0
        else:
            # The attribute must exist either way
            self.chars = None

        # If we have pre-trained vectors, load them here
        if self.pretrained is not None:
            print("USING: Pre-trained embedding vectors from %s" % self.pretrained)
            vectors = np.load(self.pretrained, allow_pickle=True)
            # np.savez pickled the dict into a 0-d object array; unwrap it
            vectors = vectors["embeddings"].item()
            # To prevent mistakes regarding embedding dimensions, take the
            # "first" value in the dict and check the embedding size
            self.dim_word = len(list(vectors.values())[0])
            # Embedding matrix, starting with a random <UNK> vector
            self.wordvec_matrix = []
            self.wordvec_matrix.append(np.random.uniform(low=-0.25, high=0.25, size=self.dim_word))
        else:
            print("USING: Randomly initialized vectors")

        # Load and join positive and negative files
        with open(self.train["pos"], "r") as f:
            pos = f.readlines()
        with open(self.train["neg"], "r") as f:
            neg = f.readlines()
        dset = pos + neg

        # If using complete clean
        if self.full_clean:
            print("USING: Full clean")
            dset = utils.full_clean(dataset=dset)

        # If using dynamic padding
        if self.dynamic_pad:
            self.maxlen_sentence = 0
            print("USING: Dynamic padding")
        else:
            # Longest sentence measured in words, not characters
            self.maxlen_sentence = max(len(line.strip().split(" ")) for line in dset)
            print("USING: Padding is set to a maximum of ", self.maxlen_sentence)

        # Build the vocabs and the embedding matrix;
        # nw and nc count word and character ids, 0 is reserved for <UNK>
        nw, nc = 1, 1
        for line in dset:
            line = line.strip().split(" ")
            for w in line:
                if w not in self.words:
                    self.words[w] = nw
                    # If using pre-trained vectors, keep the matrix row
                    # aligned with the word id
                    if self.pretrained is not None:
                        if w in vectors:
                            self.wordvec_matrix.append(vectors[w])
                        else:
                            # Unseen words get a random vector
                            self.wordvec_matrix.append(np.random.uniform(
                                low=-0.25, high=0.25, size=self.dim_word))
                    nw += 1
                    if self.use_chars:
                        for c in w:
                            if c not in self.chars:
                                self.chars[c] = nc
                                nc += 1
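Because rows are appended in vocabulary order, row i of self.wordvec_matrix corresponds to the word with id i, so the matrix can seed an embedding lookup directly. A minimal sketch, assuming an instance named dataset that has already run build_vocabs() with pre-trained vectors:

import numpy as np

matrix = np.asarray(dataset.wordvec_matrix)  # shape: (vocab size, dim_word)
# Map a sentence to ids, falling back to <UNK> for unseen words
ids = [dataset.words.get(w, dataset.words["<UNK>"]) for w in "an example sentence".split(" ")]
sentence_vectors = matrix[ids]  # shape: (number of words, dim_word)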
Example #6
        coin_table = table_container.find("div", class_="coingecko-table")
        # Walk through the nested divs down to the actual table
        coin_table = coin_table.div.div.table

        # Table headers
        table_head = coin_table.thead
        table_head_row = table_head.tr
        table_headers = table_head_row.find_all("th")

        headers = []
        for header in table_headers:
            header_text = header.text
            if header_text != "":
                # Skip blank headers; full_clean removes the
                # newlines from each string
                header_text = full_clean(header_text)
                headers.append(header_text)

        # print(headers)
        with open(file_name, "w") as crypto_file:
            csv_writer = csv.DictWriter(crypto_file,
                                        fieldnames=headers,
                                        delimiter="\t")
            # Write the header row
            csv_writer.writeheader()

        # Table content
        table_body = coin_table.tbody
        table_body_rows = table_body.find_all("tr")
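The snippet ends after collecting the body rows; writing them out would mirror the header step. A continuation sketch, reopening the file in append mode and assuming each retained header lines up with one <td> cell:

with open(file_name, "a") as crypto_file:
    csv_writer = csv.DictWriter(crypto_file,
                                fieldnames=headers,
                                delimiter="\t")
    for row in table_body_rows:
        cells = row.find_all("td")
        # Pair each header with the cleaned text of its cell
        csv_writer.writerow({h: full_clean(c.text) for h, c in zip(headers, cells)})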