import json

from predictocite.datasets.citation_groups import fetch_citationgroups


import numpy as np
import pandas as pd
from sklearn import cross_validation, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB




# Fetch the article corpus; no group list is passed to the loader here.
articles = fetch_citationgroups()

# STEP 1: Split data
# 25% of the samples are held out for testing; the fixed random_state keeps
# the split reproducible between runs.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# sklearn.model_selection.train_test_split is the replacement. Switching
# requires changing the file-level import as well.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    articles.data, articles.target, test_size=0.25, random_state=25)

# STEP 2: Extract features from text using TfidfVectorizer
# max_df is deliberately the float 1.0 (a proportion: no upper cut-off).
# The integer 1 is interpreted as an absolute document count and would
# discard every term that occurs in more than one document.
tfidf_vect = TfidfVectorizer(
    max_df=1.0,
    stop_words='english',
    ngram_range=(1, 2),
    encoding='utf-8',
    max_features=50000,
)

# fit_transform learns the vocabulary dictionary
# and returns the term-document matrix.
	def setUp(self):
		"""Load the two citation groups used by the tests into ``self.articles``."""
		self.articles = fetch_citationgroups(
			['zero_citations', 'one_to_five_citations'])
	def setUp(self):
		"""Fetch the 'one_to_five_citations' group and run the preprocessor split."""
		self.groups = ['one_to_five_citations']
		self.articles = fetch_citationgroups(self.groups)
		# The value returned by split_data() is not kept on the instance.
		# NOTE(review): confirm whether the tests were meant to use it.
		TextPreprocessor(self.articles).split_data()
	def test_fetch_all_citationgroups(self):
		# Calling the loader without a group list must give a truthy result;
		# the result is also kept on the instance as ``all_articles``.
		fetched = fetch_citationgroups()
		self.all_articles = fetched
		self.assertTrue(fetched)