import codecs
import os
import pickle
import tarfile

# Assumes scikit-learn's load_files, as in the other examples on this page.
from sklearn.datasets import load_files


def create_pkz(target_dir, source_dir):
    """
    Create a .pkz cache file so the data can be used in offline mode
    (a .pkz file is simply a zlib-compressed pickle).

    Args:
        target_dir: Location to save the .pkz file
        source_dir: Location of the 20news-bydate.tar.gz archive
    """

    DOWNLOADED_NAME = "20news-bydate.tar.gz"
    CACHE_NAME = "20news-bydate_py3.pkz"
    TRAIN_FOLDER = "20news-bydate-train"
    TEST_FOLDER = "20news-bydate-test"

    cache_path = os.path.join(target_dir, CACHE_NAME)

    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    with tarfile.open(os.path.join(source_dir, DOWNLOADED_NAME), "r:gz") as archive:
        archive.extractall(path=target_dir)

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)
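
The cache written above can be read back by reversing the two steps (zlib decompression, then unpickling). A minimal sketch, assuming create_pkz has already been run; the load_pkz helper name and the example path are illustrative only:

import codecs
import os
import pickle


def load_pkz(target_dir, cache_name="20news-bydate_py3.pkz"):
    # Read the compressed cache and undo the zlib_codec/pickle steps
    # used when it was written. Only load pickles you created yourself.
    with open(os.path.join(target_dir, cache_name), 'rb') as f:
        compressed_content = f.read()
    return pickle.loads(codecs.decode(compressed_content, 'zlib_codec'))


cache = load_pkz("scikit_learn_data/20news_home")  # hypothetical target_dir
print(cache['train'].target_names[:5])
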
Example #2
def download_classic(target_dir, cache_path):
    """Download the 20 newsgroups data and stored it as a zipped pickle."""
    archive_path = os.path.join(target_dir, ARCHIVE_NAME)
    # all_path = os.path.join(target_dir, ALL_FOLDER)
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if os.path.exists(archive_path):
        # Download is not complete as the .tar.gz file is removed after
        # download.
        logger.warning("Download was incomplete, downloading again.")
        os.remove(archive_path)

    logger.warning("Downloading dataset from %s (1.5 MB)", URL)
    opener = urlopen(URL)
    with open(archive_path, 'wb') as f:
        f.write(opener.read())

    logger.info("Decompressing %s", archive_path)
    tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
    os.remove(archive_path)

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    # cache = dict(all=load_files(all_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    shutil.rmtree(target_dir)
    return cache
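
The download above reads the whole archive into memory before writing it to disk. A streaming copy is a reasonable alternative for larger archives; a sketch, reusing the URL and archive_path names from the example:

import shutil
from urllib.request import urlopen

with urlopen(URL) as response, open(archive_path, 'wb') as f:
    # Copy the response body in chunks instead of buffering it all in memory.
    shutil.copyfileobj(response, f)
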
Example #4
    def get_cache(self, target_path):
        train_path = os.path.join(target_path, self.train_folder)
        test_path = os.path.join(target_path, self.test_folder)

        if not os.path.exists(target_path):
            os.makedirs(target_path)

        if not os.path.exists(train_path):
            os.makedirs(train_path)

        if not os.path.exists(test_path):
            os.makedirs(test_path)

        cache = dict(train=base.load_files(train_path, encoding='utf-8'),
                     test=base.load_files(test_path, encoding='utf-8'))

        # Turn the textual instance representations into WixInstance objects.
        instances = list()
        for instance in cache['train'].data:
            instances.append(WixInstance(instance))
        cache['train'].data = instances

        instances = list()
        for instance in cache['test'].data:
            instances.append(WixInstance(instance))
        cache['test'].data = instances

        compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')

        with open(self.cache_path, 'wb') as f:
            f.write(compressed_content)

        shutil.rmtree(target_path)

        return cache
    def make_cache_sample(self, train_path, test_path, cache_path):
        # Store a zipped pickle
        cache = dict(train=base.load_files(train_path, encoding='utf-8'),
                     test=base.load_files(test_path, encoding='utf-8'))

        compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
        with open(cache_path, 'wb') as f:
            f.write(compressed_content)
    def set_category(self):
        news_train = base.load_files("news")

        self.categories = [
            'dunya', 'ekonomi', 'kultur-sanat', 'magazin', 'saglik', 'siyaset',
            'spor', 'teknoloji', 'turkiye', 'yasam'
        ]

        text_clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf',
             SGDClassifier(loss='hinge',
                           penalty='l2',
                           alpha=1e-3,
                           n_iter=5,
                           random_state=42)),
        ])

        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-2, 1e-3)
        }

        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
        gs_clf = gs_clf.fit(news_train.data[:1000], news_train.target[:1000])

        self.category = news_train.target_names[gs_clf.predict([self.data])[0]]
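
A short follow-up sketch for inspecting what the grid search selected, using the gs_clf and parameters names from the method above:

print("best CV score: %0.3f" % gs_clf.best_score_)
for param_name in sorted(parameters):
    # Best value found for each tuned hyperparameter.
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
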
Example #7
    def get_cache(self, train_path, test_path, cache_path):
        cache = dict(train=base.load_files(train_path, encoding='utf-8'),
                     test=base.load_files(test_path, encoding='utf-8'))

        # Turn the tweet texts into Tweet objects.
        train_tweets = list()
        for tweet in cache['train'].data:
            try:
                tweet = tweet.encode('utf-8')
                if tweet:
                    train_tweets.append(Tweet(tweet, self._textOnlyTweets))
            except UnicodeEncodeError as unicode_error:
                print(unicode_error)
        cache['train'].data = train_tweets

        test_tweets = list()
        for tweet in cache['test'].data:
            try:
                tweet = tweet.encode('utf-8')
                if tweet:
                    test_tweets.append(Tweet(tweet, self._textOnlyTweets))
            except UnicodeEncodeError as unicode_error:
                print(unicode_error)
        cache['test'].data = test_tweets

        compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')

        cache_dir = os.path.dirname(cache_path)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        with open(cache_path, 'wb') as f:
            f.write(compressed_content)

        # TODO: When the cache is ready, need to delete all generated files
        # shutil.rmtree(cache_path)

        return cache
Example #8
def _download_20newsgroups(target_dir, cache_path):
    """Download the 20 newsgroups data and stored it as a zipped pickle."""
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url)
    archive_path = _fetch_remote(ARCHIVE, dirname=target_dir)
    archive_path = 'C:/Users/Jackie/scikit_learn_data/20news_home/20news-bydate.tar.gz'
    logger.debug("Decompressing %s", archive_path)
    tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
    # os.remove(archive_path)  # keep the downloaded archive

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    # shutil.rmtree(target_dir)  # keep the extracted source files and directories
    return cache
Example #9
def _load_document_classification(dataset_path, metadata, set_=None, **kwargs):
    if set_ is not None:
        dataset_path = os.path.join(dataset_path, set_)
    return load_files(dataset_path, metadata.get('description'), **kwargs)
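
A hypothetical call, as a sketch only: the dataset path, the metadata dict, and the encoding are illustrative assumptions, with set_ selecting a train/ or test/ subfolder as in the other examples on this page.

metadata = {'description': 'demo corpus'}  # hypothetical metadata
bunch = _load_document_classification('/data/demo', metadata,
                                       set_='train', encoding='utf-8')
print(bunch.target_names)
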
Example #11
import io
import numpy as np
"""
train datasını trainetme_traine dosya pathını veya ismini direk vererek category category datalarını okuyor



data = open("1.txt").read()

bu kodda ise deneme datasını yüklüyorsun
print(trainetme_train.target_names[gs_clf.predict([data])[0]])
bu kod ise categoryriyi print ettiriyor

"""
#categories = ['spor','teknoloji']
trainetme_train = base.load_files("news")
'''
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
 ])
'''
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   n_iter=5,
                   random_state=42)),
])

if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")
"""
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)"""
data_train = base.load_files("news")
data_test = base.load_files("test")
data = data_train


def fetchdata(data, categories):
    labels = [(data.target_names.index(cat), cat) for cat in categories]
    # Sort the categories to have the ordering of the labels
    labels.sort()
    labels, categories = zip(*labels)
    mask = np.in1d(data.target, labels)
    data.filenames = data.filenames[mask]
    data.target = data.target[mask]
    # searchsorted to have continuous labels
    data.target = np.searchsorted(labels, data.target)
    data.target_names = list(categories)
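
A hypothetical usage of fetchdata, assuming the "news" folder contains 'spor' and 'teknoloji' subfolders as in the category list further up this page; the function filters the bunch in place:

fetchdata(data_train, ['spor', 'teknoloji'])
print(data_train.target_names)        # ['spor', 'teknoloji']
print(data_train.filenames.shape[0])  # number of documents kept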