def prepare_data(load_test_data=False):
    """Vectorize the training questions with TF-IDF and return model inputs.

    The training set is read with ``load_data(TRAIN_PATH)``; its
    ``'question_text'`` column is cast to unicode strings and used to fit the
    vectorizer, and its ``'target'`` column supplies the labels.

    Args:
        load_test_data: When true, also read ``load_data(TEST_PATH)`` and
            transform its ``'question_text'`` column with the vectorizer
            fitted on the training data.

    Returns:
        If ``load_test_data`` is true, the 4-tuple
        ``(train_features, test_features, train_targets, test_data)`` where
        ``test_data`` is the loaded test set itself (not labels).

        Otherwise, the result of ``train_test_split`` over the training
        features and targets with ``test_size=0.2`` and ``shuffle=False``.
        NOTE(review): the two branches order their elements differently;
        callers must unpack according to the flag they passed.
    """
    pd_data = load_data(TRAIN_PATH)

    # "english" is meant as the stop-word list. It must be passed by keyword:
    # the vectorizer's first parameter is `input`, so a positional "english"
    # never enabled stop-word filtering (and is rejected outright by
    # scikit-learn releases whose constructor parameters are keyword-only).
    vector = TfidfVectorizer(stop_words="english")
    # Alternative left disabled in the original: CountVectorizer()
    feature_matrics = vector.fit_transform(
        pd_data['question_text'].values.astype('U')
    )
    print('prepped train')

    if load_test_data:
        test_data = load_data(TEST_PATH)
        # Reuse the fitted vocabulary: transform only, never re-fit on test.
        test_feature_matrics = vector.transform(
            test_data['question_text'].values.astype('U')
        )
        return feature_matrics, test_feature_matrics, pd_data['target'], test_data

    # shuffle=False means pick the last 20% as dev data set.
    return train_test_split(
        feature_matrics, pd_data['target'], test_size=0.2, shuffle=False
    )
def read(self):
    """Load each configured circuit and wrap it in a ``Graph``.

    Circuit names are taken from ``self.circs``; when that is empty or unset
    nothing is loaded. They are resolved against ``self.path``, falling back
    to ``"../data/output"`` when ``self.path`` is empty or unset.

    For every circuit a summary line ``"<name>: <sum(labels)>, <len(labels)>"``
    is printed.

    Returns:
        list: one ``Graph`` per circuit name, in the order of ``self.circs``.
    """
    # Names from a disabled hard-coded list that used to sit here:
    # c6288, c5315, c432, c499, c880, c1355, c1908, c3540, adder.bench,
    # arbiter.bench, cavlc.bench, dec.bench, voter.bench, sin.bench,
    # priority.bench
    base_dir = self.path or "../data/output"
    names = self.circs or []

    graphs = []
    for name in names:
        # load_data yields an (A, X, labels) triple; X must expose .toarray().
        a_mat, x_mat, labels = load_data(name, base_dir, normalize="")
        graph = Graph(x=x_mat.toarray(), a=a_mat, y=labels)
        graphs.append(graph)
        print(f"{name}: {sum(labels)}, {len(labels)}")
    return graphs
def get_data():
    """Return whatever ``load_data`` produces for ``TRAIN_PATH``."""
    train_data = load_data(TRAIN_PATH)
    return train_data