def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    """Featurize every document under *dirFolder* with word2vec embeddings.

    Each immediate subdirectory of ``dirFolder`` is one class; a TF-IDF pass
    over each class selects the words kept in the shared vocabulary
    (``featureKeepRatio`` controls how aggressively TfIdf prunes — see
    TfIdf.selectWords). For each document, each vocabulary word that occurs
    in the document contributes its 300-d GoogleNews vector; absent words
    contribute a zero vector, so every document yields a fixed-shape matrix.

    Args:
        dirFolder: Root directory; one subdirectory per class.
        featureKeepRatio: Fraction-like knob forwarded to TfIdf for
            vocabulary selection (default keeps everything).

    Returns:
        np.ndarray of shape (num_documents, vocab_size, 300).
    """
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    allDocs = []
    for dclass in classes:
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Select this class's informative words and merge into the shared vocab.
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        vocabulary |= tfidf.selectWords()
    # Freeze iteration order so every document's rows align across the corpus.
    vocabulary = list(vocabulary)
    zeroVec = np.zeros((300,))

    def getIt(word):
        # Words missing from the pretrained model map to the zero vector.
        try:
            return model[word]
        except KeyError:
            return zeroVec

    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(
                os.path.join(dirFolder, dclass, doc))
            words = set(processedFile)
            # BUG FIX: the original ignored `words` and embedded the vocabulary
            # itself, producing identical features for every document. Embed a
            # vocab word only when it actually occurs in this document.
            features = [getIt(w) if w in words else zeroVec for w in vocabulary]
            X.append(features)
    return np.stack(X)
def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    """Build train/test bag-of-words features from a class-per-directory corpus.

    Each immediate subdirectory of ``dirFolder`` is one class. Per class, the
    documents are shuffled and the first ``testRatio`` fraction is held out
    for testing; the vocabulary is selected by TF-IDF over the *training*
    documents only, then every document is featurized as raw term counts over
    that shared vocabulary.

    Args:
        dirFolder: Root directory; one subdirectory per class.
        testRatio: Fraction of each class's documents held out for testing.
        featureKeepRatio: Forwarded to TfIdf for vocabulary selection.

    Returns:
        ((X_train, Y_train), (X_test, Y_test)) where the X arrays have shape
        (num_docs, vocab_size) and the Y values are plain lists of class
        indices into the sorted class-name order.
    """
    classes = sorted(os.listdir(dirFolder))
    classNames = {idx: name for idx, name in enumerate(classes)}
    vocabulary = set()
    splits = []
    for className in classes:
        classDir = os.path.join(dirFolder, className)
        docs = os.listdir(classDir)
        np.random.shuffle(docs)
        cut = int(testRatio * len(docs))
        train, test = docs[cut:], docs[:cut]
        splits.append([train, test])
        # Vocabulary comes from training documents only — no test leakage.
        vocabulary |= TfIdf(classDir, train, featureKeepRatio).selectWords()
    # Freeze iteration order so feature columns align across documents.
    vocabulary = list(vocabulary)
    trainX, trainY = [], []
    testX, testY = [], []
    buckets = ((trainX, trainY), (testX, testY))
    for label, className in enumerate(classes):
        # splits[label] is [trainDocs, testDocs]; pair each with its bucket.
        for part, (featureRows, labels) in zip(splits[label], buckets):
            for doc in part:
                tokens = preprocess.readFile(
                    os.path.join(os.path.join(dirFolder, className), doc))
                counts = Counter(tokens)
                featureRows.append([counts.get(w, 0) for w in vocabulary])
                labels.append(label)
    return (np.stack(trainX), trainY), (np.stack(testX), testY)
def load_data(dirFolder, featureKeepRatio=1.0):
    """Build a bag-of-words count matrix over every document in the corpus.

    Each immediate subdirectory of ``dirFolder`` is one class. A TF-IDF pass
    per class selects the shared vocabulary; every document then becomes a
    vector of raw term counts over that vocabulary.

    Args:
        dirFolder: Root directory; one subdirectory per class.
        featureKeepRatio: Forwarded to TfIdf for vocabulary selection.

    Returns:
        np.ndarray of shape (num_documents, vocab_size).
    """
    # NOTE(review): this redefinition shadows the earlier
    # load_data(dirFolder, testRatio, ...) in this module — confirm which
    # signature callers actually expect.
    classLabels = sorted(os.listdir(dirFolder))
    labelToName = {idx: name for idx, name in enumerate(classLabels)}
    vocab = set()
    docsPerClass = []
    for name in classLabels:
        classPath = os.path.join(dirFolder, name)
        docs = os.listdir(classPath)
        np.random.shuffle(docs)
        docsPerClass.append(docs)
        # Merge this class's TF-IDF-selected words into the shared vocabulary.
        vocab |= TfIdf(classPath, docs, featureKeepRatio).selectWords()
    # Freeze iteration order so feature columns align across documents.
    vocab = list(vocab)
    rows = []
    for name, docs in zip(classLabels, docsPerClass):
        for doc in docs:
            tokens = preprocess.readFile(
                os.path.join(os.path.join(dirFolder, name), doc))
            tokenCounts = Counter(tokens)
            rows.append([tokenCounts.get(term, 0) for term in vocab])
    return np.stack(rows)