Example #1
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
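A minimal usage sketch for the loader above, assuming the module-level `labels` variable it reads via globals() exists; the One-vs-Rest logistic-regression classifier is an illustrative choice, not part of the original snippet.

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

data = load_data()
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(data['x_train'], data['y_train'])
# score() on a multilabel indicator target reports subset accuracy
print(clf.score(data['x_test'], data['y_test']))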
Example #2
def load_data():
    """
    Load the Reuters dataset.

    Returns
    -------
    train_docs, train_labels, test_docs, test_labels.
    """
    documents = reuters.fileids()
    train = [d for d in documents if d.startswith('training/')]
    train_docs = [reuters.raw(doc_id) for doc_id in train]
    train_docs = [text_prepare(x) for x in train_docs]
    train_labels = [reuters.categories(doc_id) for doc_id in train]

    test = [d for d in documents if d.startswith('test/')]
    test_docs = [reuters.raw(doc_id) for doc_id in test]
    test_docs = [text_prepare(x) for x in test_docs]
    test_labels = [reuters.categories(doc_id) for doc_id in test]

    print("len(train_docs)={}, len(train_labels)={}".format(
        len(train_docs), len(train_labels)))
    print("len(test_docs)={}, len(test_labels)={}".format(
        len(test_docs), len(test_labels)))

    mlb = MultiLabelBinarizer(classes=sorted(labels))
    train_labels = mlb.fit_transform(train_labels)
    test_labels = mlb.transform(test_labels)
    print("y_train.shape={}, y_test.shape={}".format(train_labels.shape,
                                                     test_labels.shape))

    return (train_docs, train_labels, test_docs, test_labels, mlb.classes_)
Example #3
def getDocIDs_top10():
    # Top 10 Categories
    documents = [
        f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1
    ]
    train_docs_id = list(
        filter(
            lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
            documents))
    test_docs_id = list(
        filter(
            lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
            documents))
    new_train_docs_id = []
    new_test_docs_id = []
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in train_docs_id]
        li_te = [
            f for f in reuters.fileids(categories=cat) if f in test_docs_id
        ]
        if len(li) > 20 and len(li_te) > 20:
            new_train_docs_id.extend(li)
            new_test_docs_id.extend(li_te)
    train_docs_id = new_train_docs_id
    test_docs_id = new_test_docs_id
    return (train_docs_id, test_docs_id)
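A hedged sketch of consuming the id lists returned above; because the ids are filtered to single-category documents, taking the first category is safe.

train_ids, test_ids = getDocIDs_top10()
X_train = [reuters.raw(doc_id) for doc_id in train_ids]
y_train = [reuters.categories(doc_id)[0] for doc_id in train_ids]
X_test = [reuters.raw(doc_id) for doc_id in test_ids]
y_test = [reuters.categories(doc_id)[0] for doc_id in test_ids]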
Example #4
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])
    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': globals()["labels"]
    }
    return data
Example #5
    def __init__(self, corpusData):
        # getting all the document ids in corpusData, such as reuters
        docs = corpusData.fileids()

        # splitting into training and test docs ids
        self.train_docs_ids = list(
            filter(lambda doc: doc.startswith("train"), docs))
        self.test_docs_ids = list(
            filter(lambda doc: doc.startswith("test"), docs))

        # getting the actual data from those ids
        self.train_docs = [
            corpusData.raw(doc_id) for doc_id in self.train_docs_ids
        ]
        self.test_docs = [
            corpusData.raw(doc_id) for doc_id in self.test_docs_ids
        ]

        self.docs = self.train_docs + self.test_docs

        # transforming multilabels
        mlb = MultiLabelBinarizer()
        self.train_labels = mlb.fit_transform(
            [reuters.categories(doc_id) for doc_id in self.train_docs_ids])
        self.test_labels = mlb.transform(
            [reuters.categories(doc_id) for doc_id in self.test_docs_ids])

        #vectorizers
        self.count_vectorizer = CountVectorizer(analyzer='word',
                                                stop_words='english')
        self.tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                                stop_words='english',
                                                token_pattern='[A-Za-z]{3,}')
Example #6
def load_data(config={}):

    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words, binary=True)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])
    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': reuters.categories()
    }
    print(data['x_train'])
    print(data['y_train'])
    return data, vectorizer.vocabulary_
Example #7
    def categorize_kNNSimScoring(testDocs,
                                 trainDocs,
                                 docId,
                                 neighbors,
                                 fr=0.75):
        """
		Uses a similarity scoring approach for categorizing the test docs. With the nearest neighbors,
		for each class we assign a score based on which neighboring nodes have that class and assign weightings based 
		on the cosine similarity.
		"""

        #input is a test doc (assume nonempty or else the fr part in line 446 (second to last) will cause issues)

        #O(n^3)
        categories = reuters.categories(neighbors)
        catgScores = []
        for c in categories:
            score = 0.0
            for n in neighbors:
                if c in reuters.categories(n):
                    score += classifierTraining.dot(testDocs[docId],
                                                    trainDocs[n].doc_vec)
            catgScores.append((score, c))
        catgScores = heapq.nlargest(
            int(round(len(categories) * fr)),
            catgScores)  #should check if round function is O(1) or O(n)
        return [c for s, c in catgScores]
Example #8
def get_raw_data():
    
    nltk.download("reuters")
    from nltk.corpus import reuters
    
    documents = reuters.fileids()
    train_docs_id = list(filter(lambda doc: doc.startswith("train"),
                                documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"),
                               documents))
    X_train = [(reuters.raw(doc_id)) for doc_id in train_docs_id]
    X_test = [(reuters.raw(doc_id)) for doc_id in test_docs_id]


    y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
    y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]

    all_data = X_train + X_test
    all_labels = y_train + y_test

    mlb = MultiLabelBinarizer()
    binarized_labels = mlb.fit_transform(all_labels)

    return all_data, all_labels
Example #9
    def __init__(self):
        self.documents = []
        self.categories = reuters.categories()
        self.tfidf = tf_idf()

        # iterate over all reuters documents
        for docid in reuters.fileids():
            # discard documents that have more than one category
            if len(reuters.categories(docid)) > 1:
                continue

            # determine the document's category
            cat = 0
            for i in range(90):
                if self.categories[i] in reuters.categories(docid):
                    cat = i

            # determine whether the document is meant for training or testing
            if docid.startswith("train"):
                train = 1
            elif docid.startswith("test"):
                train = 0
            else:
                raise ValueError("unexpected fileid: " + docid)
            text = reuters.raw(docid)
            doc = document(text, cat, train)
            # add the document to the TfIdf class - needed for later computations
            self.tfidf.add_document(doc)
            # add the document to the document list
            self.add_document(doc)
        self.initialize_vocabulary()
Example #10
def reuters_dataset():
    nltk.download('reuters')
    nltk.download('stopwords')
    stop_words = stopwords.words("english")

    documents = reuters.fileids()

    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    print(len(train_docs), len(test_docs))

    vectorizer = TfidfVectorizer(stop_words=stop_words)

    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # print([reuters.categories(doc_id) for doc_id in test_docs_id])

    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])

    return (vectorised_train_documents.toarray(),
            vectorised_test_documents.toarray(), train_labels, test_labels)
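A minimal consumer of the four arrays returned above; the linear-SVM one-vs-rest classifier is an assumption for illustration. Note that toarray() materializes dense matrices, which can be memory-heavy for the full corpus.

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = reuters_dataset()
clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)
# subset accuracy on the multilabel test set
print(clf.score(X_test, y_test))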
Example #11
def reuters_to_df(set_name, label_to_idx):

    data = [x for x in reuters.fileids() if set_name in x]

    # collect all data to create df from
    all_texts = [
        " ".join([" ".join(sen) for sen in reuters.sents(doc_id)])
        for doc_id in data
    ]

    all_labels = np.zeros((len(all_texts), len(label_to_idx)))
    all_label_indices = [[
        label_to_idx[lab] for lab in reuters.categories(doc_id)
    ] for doc_id in data]

    for i, labs in enumerate(all_label_indices):
        # binary encode the labels
        all_labels[i][labs] = 1

    all_labels = all_labels.astype(int)
    # all_labels[all_label_indices] = 1
    cols = ["text"]
    label_cols = ["topic_{}".format(lab) for lab in reuters.categories()]
    cols.extend(label_cols)
    # create df and set values
    df = pd.DataFrame(columns=cols)
    df["text"] = all_texts
    df[label_cols] = all_labels

    return df
Example #12
def get_data_splits():
    train_docs, train_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                     for i in reuters.fileids()
                                     if i.startswith('training/')])
    test_docs, test_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                   for i in reuters.fileids()
                                   if i.startswith('test/')])
    return train_docs, train_labels, test_docs, test_labels
Example #13
def print_reuters():
    from nltk.corpus import reuters
    # print reuters.fileids()
    # print reuters.categories()
    print(reuters.categories('training/9865'))
    print(reuters.categories(['training/9865', 'training/9880']))
    print(reuters.fileids('barley'))
    print(reuters.fileids(['barley', 'corn']))
Example #14
 def __init__(self):
     # print reuters categories
     print "reuters categories"
     print reuters.categories()
     # TODO this is probably bad
     print "getting nodes"
     self.nodes = database.get_all_nodes()
     print "training classifier"
     self.classifier = DocumentClassifier()
Example #15
def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if  len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    if (l1 + l2) > min_len and (l1 + l2) < max_len and min(l1, l2) / float(l1 + l2) > percentage:
                        print(cat, cat2, l1 + l2, min(l1, l2) / float(l1 + l2))
Example #16
    def __init__(self, min_eic=5):

        self.test_classes = []
        self.test_docs = []
        self.train_classes = []
        self.train_docs = []
        self.table_of_classes = []
        self.num_of_instances = []

        # minimal encounter in classes
        self.min_eic = min_eic

        if Path("training_cache/train_docs").is_file() and Path("training_cache/train_classes").is_file() \
                and Path("training_cache/test_docs").is_file() and Path("training_cache/test_classes").is_file() \
                and Path("classify_cache/table_of_classes").is_file():
            self.train_docs = joblib.load("training_cache/train_docs")
            self.train_classes = joblib.load("training_cache/train_classes")

            self.test_docs = joblib.load("training_cache/test_docs")
            self.test_classes = joblib.load("training_cache/test_classes")

            self.table_of_classes = joblib.load(
                "classify_cache/table_of_classes")
        else:
            raw_test_classes = []
            raw_train_classes = []

            for doc_id in reuters.fileids():
                if doc_id.startswith("train"):
                    self.train_docs.append(
                        prepare_text_for_analysis(reuters.raw(doc_id)))
                    raw_train_classes.append(reuters.categories(doc_id))
                else:
                    self.test_docs.append(
                        prepare_text_for_analysis(reuters.raw(doc_id)))
                    raw_test_classes.append(reuters.categories(doc_id))

            self.make_table_of_classes(raw_train_classes)
            self.train_classes = self.transform_classes(
                raw_train_classes, "train")
            self.test_classes = self.transform_classes(raw_test_classes,
                                                       "test")

            joblib.dump(self.train_docs,
                        "training_cache/train_docs",
                        compress=9)
            joblib.dump(self.train_classes,
                        "training_cache/train_classes",
                        compress=9)
            joblib.dump(self.test_docs, "training_cache/test_docs", compress=9)
            joblib.dump(self.test_classes,
                        "training_cache/test_classes",
                        compress=9)
            joblib.dump(self.table_of_classes,
                        "classify_cache/table_of_classes",
                        compress=9)
Example #17
def main():
    collection_stats()

    print("Staring classifier ..")

    X_train = list()
    X_test = list()

    y_train = list()
    y_test = list()

    print("Reading training and testing data ..")

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            X_train.append(reuters.raw(doc_id))
            y_train.append(reuters.categories(doc_id))
        else:
            X_test.append(reuters.raw(doc_id))
            y_test.append(reuters.categories(doc_id))

    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train)
    X_test = numpy.array(X_test)
    y_test = numpy.array(y_test)

    binarizer = MultiLabelBinarizer(classes=reuters.categories())

    classifier = Pipeline([
        ('vectorizer',
         TfidfVectorizer(tokenizer=tokenize,
                         min_df=0,
                         max_df=0.90,
                         max_features=3000,
                         use_idf=True,
                         sublinear_tf=True)),
        # ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
    print("Training classifier ..")
    classifier.fit(X_train, binarizer.fit_transform(y_train))
    print("Testing classifier ..")
    res = classifier.predict(X_test)

    hard_precision = classifier.score(X_test, binarizer.transform(y_test))

    y_test_bin = binarizer.transform(y_test)
    precision = average_precision_score(y_test_bin, res, average=None)
    recall = recall_score(y_test_bin, res, average=None)
    f1score = f1_score(y_test_bin, res, average=None)
    print("Hard precision: " + str(hard_precision))

    log_results(reuters.categories(), precision, recall, f1score)
Example #18
def get_test_set():
    single_categories = [(id, re.categories(id)[0])
                         for id in re.fileids()
                         if len(re.categories(id)) == 1]

    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0]
                       for x in single_cat_list
                       if x[1] < 600 and x[1] > 200]

    return [pair for pair in single_categories if pair[1] in used_categories]
Example #19
    def get_target(self):

        # cat1 vs. cat2
        if len(self.categories) > 1:
            target = [ [cat for cat in reuters.categories(fileid) if cat in self.categories][0]
                       for fileid in self.fileids]
        # cat1 vs. not cat1
        else:
            target = [ 1 if self.categories[0] in reuters.categories(fileid) else 0
                       for fileid in self.fileids]
        self.classes, target = np.unique(target, return_inverse=True)
        return target
Example #20
def labels(filenames, cats=None):
    """Return topic labels (one-hot format) for given files

    :param filenames: selected files from Reuters dataset
    :param cats: categories to filter (optional)
    :return: topic labels (one-hot format)
    """
    if cats is None: cats = reuters.categories()
    data = [[c for c in reuters.categories(f) if c in cats] for f in filenames]
    mb = MultiLabelBinarizer(classes = cats)
    onehot = mb.fit_transform(data)
    df = pd.DataFrame(onehot, columns=cats)
    return df
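A hypothetical call for the helper above, assuming the NLTK reuters corpus is downloaded; the train-split filter mirrors the other examples on this page.

train_ids = [f for f in reuters.fileids() if f.startswith('training/')]
y_train = labels(train_ids)               # one row per document, one column per topic
y_acq = labels(train_ids, cats=['acq'])   # restrict the one-hot columns to a single topic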
Example #21
    def get_default_split():
        documents = reuters.fileids()
        train_docs_id = list(
            filter(lambda doc: doc.startswith("train"), documents))
        test_docs_id = list(
            filter(lambda doc: doc.startswith("test"), documents))

        X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
        X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]
        Y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
        Y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]

        return X_train, Y_train, X_test, Y_test
Example #22
def get_labels():
    most_common_class = collections.Counter(
            [c for cs in [reuters.categories(fileid) for fileid in fileids] \
            for c in cs]).most_common(1)[0][0]
    print('Most common class in sampled documents:',
          most_common_class)
    return (
            np.array(
                    [1 if most_common_class in reuters.categories(
                            fileid) else 0 for fileid in fileids], 
                     dtype=np.int32),
            ('other', most_common_class)
            )
Example #23
def get_topics(min_samples=None):
    """Return set of topics from Reuters corpus

    If *min_samples* is specified, only topics with at
    least that many examples are included.

    :param min_samples: minimum number of example per topic
    :return: list of topics
    """
    cats = reuters.categories()
    if min_samples is not None:
        cats = [c for c in reuters.categories() if len(reuters.fileids(c)) >= min_samples]
    return cats
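get_topics composes naturally with the labels() helper from Example #20; a hedged sketch, assuming both functions are in scope and the corpus is available.

frequent_topics = get_topics(min_samples=50)        # keep only topics with at least 50 documents
train_ids = [f for f in reuters.fileids() if f.startswith('training/')]
y_train = labels(train_ids, cats=frequent_topics)   # one-hot frame restricted to those topics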
Example #24
def build_output(dataset='train'):
    documents = reuters.fileids()
    docs_id = list(filter(lambda doc: doc.startswith(dataset), documents))
    output = np.zeros((len(docs_id), len(reuters.categories())))
    reuters_categories = reuters.categories()
    for i, doc_id in enumerate(docs_id):
        if i % 100 == 0:
            print(i)
        for category in reuters.categories(doc_id):
            output[i, reuters_categories.index(category)] = 1
    return output
Example #25
def load_data(valid_percent=0.1):
    """
    Load the Reuters dataset.

    Returns:
        raw text and raw labels for train, valid, test set.
    """

    nltk.download('reuters')
    n_classes = 90
    labels = reuters.categories()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    ys = {'train': [], 'test': []}
    ys['train'] = [reuters.categories(doc_id) for doc_id in train]
    ys['test'] = [reuters.categories(doc_id) for doc_id in test]

    # Validation
    n_valid = int(valid_percent * len(ys['train']))
    np.random.seed(5)
    idxs = np.random.choice(len(ys['train']), n_valid, replace=False)
    idx_set = set(idxs)
    docs['valid'] = []
    ys['valid'] = []
    train_docs = []
    train_y = []
    for idx, (x, y) in enumerate(zip(docs['train'], ys['train'])):
        if idx in idx_set:
            docs['valid'].append(x)
            ys['valid'].append(y)
        else:
            train_docs.append(x)
            train_y.append(y)

    data = {
        'x_train': train_docs,
        'y_train': train_y,
        'x_valid': docs['valid'],
        'y_valid': ys['valid'],
        'x_test': docs['test'],
        'y_test': ys['test'],
        'labels': labels
    }
    return data
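The loader above returns raw text, so feature extraction is left to the caller; a minimal sketch (the TfidfVectorizer choice is an assumption, not part of the snippet).

from sklearn.feature_extraction.text import TfidfVectorizer

data = load_data(valid_percent=0.1)
vectorizer = TfidfVectorizer(stop_words='english')
x_train = vectorizer.fit_transform(data['x_train'])
x_valid = vectorizer.transform(data['x_valid'])
x_test = vectorizer.transform(data['x_test'])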
Example #26
def load_data():
    docs = reuters.fileids()
    train_ids = [doc for doc in docs if doc.startswith("train")]
    test_ids = [doc for doc in docs if doc.startswith("test")]

    train_data = pd.DataFrame([(reuters.raw(id), reuters.categories(id)[0])
                               for id in train_ids],
                              columns=('text', 'labels'))

    test_data = pd.DataFrame([(reuters.raw(id), reuters.categories(id)[0])
                              for id in test_ids],
                             columns=('text', 'labels'))

    return train_data, test_data
Example #27
def loadReutersData(documents,labels):
    categories_list=['acq','crude','earn','grain','interest','money-fx','ship','trade']
    docCount=0
    for category in categories_list:
        print(category)
        for document_id in reuters.fileids(category):
            if len(reuters.categories(document_id)) == 1:
                content = str(reuters.raw(document_id))
                soup = BeautifulSoup(content)
                content = soup.get_text()
                documents.append(content)
                docCount += 1
                labels.append(reuters.categories(document_id)[0])
Example #28
    def stats(self):
        """
        :return:    Important statistics about the dataset - numbers of documents in different classes with
                    corresponding percentages, as well as vocabulary sizes for every class.
        """
        lt = LemmaTokenizer()
        train_stats = {}
        test_stats = {}

        for c in reuters.categories():
            train_stats[c] = {
                'num_of_docs': 0,
                'percentage': 0.0,
                'words': set([])
            }
            test_stats[c] = {
                'num_of_docs': 0,
                'percentage': 0.0,
                'words': set([])
            }

        for d in self.train:
            c = reuters.categories(d)[0]
            train_stats[c]['num_of_docs'] += 1
            train_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))
        for d in self.test:
            c = reuters.categories(d)[0]
            test_stats[c]['num_of_docs'] += 1
            test_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))

        s_train = sum(train_stats[c]['num_of_docs']
                      for c in train_stats.keys())
        s_test = sum(test_stats[c]['num_of_docs'] for c in test_stats.keys())

        res = ({}, {})

        for c in train_stats.keys():
            if train_stats[c]['num_of_docs'] != 0:
                train_stats[c][
                    'percentage'] = train_stats[c]['num_of_docs'] / s_train
                train_stats[c]['words'] = len(train_stats[c]['words'])
                res[0][c] = train_stats[c]
        for c in test_stats.keys():
            if test_stats[c]['num_of_docs'] != 0:
                test_stats[c][
                    'percentage'] = test_stats[c]['num_of_docs'] / s_test
                test_stats[c]['words'] = len(test_stats[c]['words'])
                res[1][c] = test_stats[c]

        return res
Example #29
def create_train_data(docs, dictionary):
    all_topics = reuters.categories()
    train_data = []
    for i in range(len(docs)):
        # Process 1 raw document to list without numbers, signs, etc.
        wordlist = process_doc(docs[i])
        len_wordlist = len(wordlist)

        # Remove stop words
        wordlist = remove_stop_words(wordlist)
        len_wordlist_no_sw = len(wordlist)

        # Number of stop words in document
        sw_count = len_wordlist - len_wordlist_no_sw

        # Replace words from doc not included in our dictionary by __OOV__
        fv = []
        fv_nn = []
        for word in dictionary:  # a list of dicts, e.g. [ {2:x} , {5:y} ... ]
            if word[-1] in wordlist:  # x, y
                count = wordlist.count(
                    word[-1])  # number of occurrences of the word in the document
                # fv.append(dict({count:word[-1]}))  # human readable FV
                # fv_nn.append(count)     # NN readable FV
                fv_nn.append(((count / len_wordlist) *
                              100))  # occurrences of the word in the document, in %
            else:
                # fv.append('__OOV__')
                fv_nn.append(0.)

        # Total number of words in the document, appended to the end of the FV
        # fv.append(len_wordlist)
        fv_nn.append(len_wordlist)

        # Number of stop words in the document, as a percentage, appended to the end of the FV
        # fv.append(float("{0:.2f}".format((sw_count/len_wordlist)*100)))
        # fv_nn.append(float("{0:.2f}".format((sw_count/len_wordlist)*100)))
        fv_nn.append(((sw_count / len_wordlist) * 100))

        # Position of the topic in the list of all topics (its position is unique), appended to the end of the FV
        doc_topic = reuters.categories(docs[i])
        topic_id = all_topics.index(doc_topic[0])
        fv_nn.append(topic_id)

        # Each iteration processes one document and appends its feature vector + topic id to the training data
        train_data.append(fv_nn)

    return train_data  # all Feature Vectors as list + result topic
Example #30
def create_new_dataset(train_docs_id, test_docs_id, top10_categories):
    #train data
    new_train_docs_id = []
    y__train = []
    instaces_vector = [0, 0]  # vector counting the instances of earn and acq
    for train_doc in train_docs_id:
        max_instances = float('inf')
        new_category = ''
        for doc_category in reuters.categories(train_doc):
            for i in range(0, len(top10_categories)):
                if doc_category == top10_categories[i][
                        1] and max_instances > top10_categories[i][0]:
                    new_category = doc_category
                    max_instances = top10_categories[i][0]
        if new_category != '':  # if the document has at least one class belonging to the top 10
            if new_category == u'earn':
                if instaces_vector[0] < 500:
                    instaces_vector[0] += 1
                    new_train_docs_id.append(train_doc)
                    y__train.append(new_category)
            elif new_category == u'acq':
                if instaces_vector[1] < 500:
                    instaces_vector[1] += 1
                    new_train_docs_id.append(train_doc)
                    y__train.append(new_category)
            else:
                new_train_docs_id.append(train_doc)
                y__train.append(new_category)

    # test data
    new_test_docs_id = []
    y__test = []
    instaces_vector = [0, 0]
    for test_doc in test_docs_id:
        max_instances = float('inf')
        new_category = ''
        for doc_category in reuters.categories(test_doc):  # loop over the documents

            for i in range(0, len(top10_categories)):
                if doc_category in top10_categories[i][
                        1] and max_instances > top10_categories[i][0]:
                    new_category = doc_category
                    max_instances = top10_categories[i][0]
            if new_category != '':  # if the document has at least one class belonging to the top 10
                new_test_docs_id.append(test_doc)
                y__test.append(new_category)

    return new_train_docs_id, y__train, new_test_docs_id, y__test
Example #31
def create_tfidf_data(docs,categories,n=None):
    """
    Builds a [(label, [words])] structure by parsing the documents
    :param docs: list of reuters documents
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]

    cat_num = {}
    for i, c in enumerate(categories, start=1):
        cat_num[c] = i

    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())

    return y, corpus
Example #32
def categorize_reuters():
    '''
    Parses dataset to only examine documents associated
    with a single category.
    '''
    categories = {}
    for file_id in reuters.fileids():
        if len(reuters.categories(file_id)) == 1:
            cat = reuters.categories(file_id)[0]
            if cat not in categories.keys():
                categories[cat] = {}

            text = reuters.raw(file_id)
            categories[cat][file_id.replace('/', '_')] = text

    return categories
Example #33
def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f))==1]    
    testset = [f for f in clean_files if f[:5]=='test/']
    trainset = [f for f in clean_files if f[:9]=='training/']
    for cat in reuters.categories():
        li=[f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li)>20 and len(li_te)>20:
            cat_num_docs[cat]=len(li)
            li.extend(li_te)
            categories_file_name_dict[cat]=li
    return [[ f for f in trainset if f2c('reuters',f) in categories_file_name_dict],
            [ f for f in testset if f2c('reuters',f) in categories_file_name_dict]]            
Example #34
def f2c(corpus, fileName):
    if corpus=='mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids = fileName)[0]    
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids = fileName)[0]    
Example #35
def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    labeled_words = []

    for label in reuters.categories():
        labeled_words.append((label, reuters.words(categories=[label])))

    return high_information_words(labeled_words, score_fn=score_fn)
Example #36
def collection_stats():
	# List of documents
	documents = reuters.fileids()
	print(str(len(documents)) + " documents");
	
	train_docs = list(filter(lambda doc: doc.startswith("train"), documents));
	print(str(len(train_docs)) + " total train documents");
	
	test_docs = list(filter(lambda doc: doc.startswith("test"), documents));	
	print(str(len(test_docs)) + " total test documents");

	# List of categories 
	categories = reuters.categories();
	print(str(len(categories)) + " categories");

	# Documents in a category
	category_docs = reuters.fileids("acq");

	# Words for a document
	document_id = category_docs[0]
	document_words = reuters.words(category_docs[0]);
	print(document_words);	

	# Raw document
	print(reuters.raw(document_id));
Example #37
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    docs_per_category = list()
    # Documents in a category
    for category in categories:
        category_docs = reuters.fileids(category)
        print('for the following category: {} this is the amount of docs: {}'.
              format(category, len(category_docs)))
        docs_per_category.append(len(category_docs))

    print('mean of docs per category is: {}'.format(
        np.mean(np.array(docs_per_category))))
    print('standard deviation of docs per category is: {}'.format(
        np.std(np.array(docs_per_category))))
    print('this is the min docs per category: {}'.format(
        min(docs_per_category)))
    print('this is the max docs per category: {}'.format(
        max(docs_per_category)))
Example #38
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
Example #39
def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
	labeled_words = []
	
	for label in reuters.categories():
		labeled_words.append((label, reuters.words(categories=[label])))
	
	return high_information_words(labeled_words, score_fn=score_fn)
Example #40
def get_documents():
    """
    Get documents from 20 News Groups, Movie Reviews and Reuters corpora.
    
    Returns:
        list of str: Small subset of documents from News Groups, Movie Reviews 
            and Reuters corpora
    """
    dataset = fetch_20newsgroups(subset='all',
                                 shuffle=True,
                                 remove=('headers', 'footers', 'quotes'))
    corpus_20newsgroups = dataset.data[:5]

    tuples = [(movie_reviews.raw(fileid), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

    corpus_movies = [tuple_[0] for tuple_ in tuples]
    shuffle(corpus_movies)
    corpus_movies = corpus_movies[:5]

    tuples = [(reuters.raw(fileid), reuters.categories(fileid))
              for fileid in reuters.fileids()]
    corpus_reuters = [tuple_[0] for tuple_ in tuples]
    shuffle(corpus_reuters)
    corpus_reuters = corpus_reuters[:5]

    corpus = list()
    corpus.extend(corpus_20newsgroups)
    corpus.extend(corpus_movies)
    corpus.extend(corpus_reuters)

    return corpus
Example #41
    def __run2__(trainDocs, testDocs):
        """
		This method takes the filtered&processed train and processed test docs and performs the nearest neighbor retrieve
        with n=k. With this result, the k nearest neighbors of this nearest neighbor are categorized using kNN similarity scoring
		and the required values necessary for F1 evaluation are returned.

		"""

        #testDocs is of form {docId : doc_vec}
        fp = 0
        tp = 0
        fn = 0
        for docId, vec in testDocs.items():
            nearest_neighbors = hyperplaneClassifier.nearestNeighbors(
                trainDocs, docId, vec)
            cats = set(
                hyperplaneClassifier.categorize_kNNSimScoring(
                    testDocs, trainDocs, docId, nearest_neighbors))
            print(cats)
            actual_cats = set(reuters.categories(docId))
            print(actual_cats)
            print("\n")
            fp += len(cats.difference(actual_cats))
            tp += len(cats.intersection(actual_cats))
            fn += len(actual_cats.difference(cats))
        return fp, tp, fn
Example #42
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the reuters corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total), 
                    file=log)
Example #43
 def __init__(self):
     # Generate training set from sample of Reuters corpus
     train_docs = [(self.bag_of_words(reuters.words(fileid)), category)
                   for category in reuters.categories()
                   for fileid in reuters.fileids(category) if
                   fileid.startswith("train")]
     # Create a classifier from the training data
     self.classifier = NaiveBayesClassifier.train(train_docs)
Example #44
def format_data(docs, all_categories):
    y = []
    corpus = []
    for d in docs:
        current_categories = [x for x in reuters.categories(d) if x in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
Example #45
def makeWordSet(args=None):
    '''Use the Brown corpus to see how many words used'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
Example #46
 def __init__(self, dataset=''):
     """
         Docs in reuters corpus are identified by ids like "training|test/xxxx".
     :param dataset: filter for ids
     """
     self.dataset = dataset # filter docs
     self.categories = {c: n for n, c in enumerate(reuters.categories())} # map class with int
     self.docs = {d: n for n, d in enumerate(reuters.fileids())}  # map docs with int
     self.category_mask = [] # mask nth doc with its ith class
Example #47
 def __iter__(self):
     """ Generator of docs while collecting ordered structured info. """
     for n, reutersid in enumerate(reuters.fileids()):         # 'training|test/xxxx'
         dataset, _ = reutersid.split('/')       # extract dataset
         if self.dataset in dataset:             # yield only filtered dataset
             if self.categories is not None:
                 top_category = reuters.categories(reutersid)[0]            # grab first category only
                 self.category_mask.append(self.categories[top_category])   # n-th doc -> classid
             yield reuters.raw(reutersid)        # return raw document
Example #48
def reuters_train_test_feats(feature_detector=bag_of_words):
	train_feats = []
	test_feats = []
	for fileid in reuters.fileids():
		if fileid.startswith('training'):
			featlist = train_feats
		else:   # fileid.startswith('test')
			featlist = test_feats
		feats = feature_detector(reuters.words(fileid))
		labels = reuters.categories(fileid)
		featlist.append((feats, labels))
	return train_feats, test_feats
Example #49
def create_tfidf_data(docs,n=None):
    """
    Builds a [(label, [words])] structure, removing stop words
    and parsing the documents
    :param docs: list of reuters documents
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]
    y = [reuters.categories(d)[0] for d in docs]
    corpus = [reuters.raw(d).lower() for d in docs]
    return y, corpus
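A hedged usage sketch for the helper above; the vectorizer is an assumption, since the function itself only returns labels and lowercased raw text.

from sklearn.feature_extraction.text import TfidfVectorizer

train_ids = [f for f in reuters.fileids() if f.startswith('training/')]
y, corpus = create_tfidf_data(train_ids, n=1000)
X = TfidfVectorizer(stop_words='english').fit_transform(corpus)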
Example #50
def makeData(file, set):
    labels = []
    f = open(file, "w")
    for doc in set:
        title = []
        label = reuters.categories(doc)[0]
        labels.append(label)
        for i in reuters.words(doc):
            if not i.isupper():
                break
            else:
                title.append(i)
        f.write(' '.join(title) + "\n")
    f.close()

    f = open("labels" + file, "w")
    f.write("\n".join(labels))
    f.close()
Example #51
    def computeStats(self, categories):
        files = batchReadReuters('training', categories)
        for file_name in files:
            raw_txt = readFromFile('/home/dales3d/nltk_data/corpora/reuters/' + file_name)
            fileCategories = reuters.categories(file_name)
            #for cat in categories:
            #	if cat not in self.activeCategories:
            #		self.activeCategories.append(cat)
            self.activeCategories = categories

            words = extractWords(raw_txt)
            keywords = meter(words)
            for word in keywords:
                if word not in self.wordsStatDict:
                    self.wordsStatDict[word] = WordStats()
                w_stat = self.wordsStatDict[word]
                w_stat.word = word
                w_stat.addText(file_name, keywords[word], fileCategories)
Example #52
def explore_reuters():

    reuters.fileids()
    reuters.categories()

    reuters.categories('training/9865')
    reuters.categories(['training/9865', 'training/9880'])
    reuters.fileids('barley')
    reuters.fileids(['barley', 'corn'])

    reuters.words('training/9865')[:14]
    reuters.words(['training/9865', 'training/9880'])
    reuters.words(categories='barley')
    reuters.words(categories=['barley', 'corn'])
Example #53
def exercise_reuters():
    print(reuters.fileids())
    print(reuters.categories())
    # view the topics of a single document
    print(reuters.categories("training/9865"))
    # view the topics of multiple documents
    print(reuters.categories(["training/9865", "training/9880"]))
    # view the documents for a topic
    print(reuters.fileids("barley"))
    # view the documents for multiple topics
    print(reuters.fileids(["barley", "corn"]))
    # view the words of a document
    print(reuters.words("training/9865"))
    # view the words of multiple documents
    print(reuters.words(["training/9865", "training/9880"]))
    # view the words of a topic
    print(reuters.words(categories="barley"))
    # view the words of multiple topics
    print(reuters.words(categories=["barley", "corn"]))
Example #54
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

doc2vec = Doc2Vec.load(doc2vec_model_location)

# Convert the categories to one hot encoded categories
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)} for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)} for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])
train_data, test_data, train_labels, test_labels = numpy.asarray(train_data), numpy.asarray(test_data), numpy.asarray(train_labels), numpy.asarray(test_labels)

# Initialize the neural network
Example #55
def load_reuters(setName):
    html = HTMLParser.HTMLParser()
    doc_ids = reuters.fileids()
    cat2all_ids = {}
    cat2train_ids = {}
    cat2test_ids = {}
    cat2all_num = {}
    cand_docNum = 0
    
    for doc_id in doc_ids:
        # only choose docs belonging in one category
        if len( reuters.categories(doc_id) ) == 1:
            cat = reuters.categories(doc_id)[0]
            cand_docNum += 1
            
            if doc_id.startswith("train"):
                cat2set_ids = cat2train_ids
            else:
                cat2set_ids = cat2test_ids
                
            if cat in cat2set_ids:
                cat2set_ids[cat].append(doc_id)
            else:
                cat2set_ids[cat] = [ doc_id ]
            
            # both train and test doc_ids are put in cat2all_ids
            if cat in cat2all_ids:
                cat2all_ids[cat].append(doc_id)
            else:
                cat2all_ids[cat] = [ doc_id ]
            if cat in cat2all_num:
                cat2all_num[cat] += 1
            else:
                cat2all_num[cat] = 1
            
    print "Totally %d docs, %d single-category docs in %d categories" %( len(doc_ids), 
                    cand_docNum, len(cat2train_ids) )
                    
    sorted_cats = sorted( cat2all_num.keys(), key=lambda cat: cat2all_num[cat],
                            reverse=True )
                            
    catNum = 10
    cats_docsWords = [ [] for i in xrange(catNum) ]
    cats_docNames = [ [] for i in xrange(catNum) ]
                            
    topN_cats = sorted_cats[:catNum]
    print "Top 10 categories:"
    keptAllDocNum = 0
    keptTrainDocNum = 0
    keptTestDocNum = 0
    
    for cat in topN_cats:
        print "%s: %d/%d" %( cat, len(cat2train_ids[cat]), len(cat2test_ids[cat]) )
        keptTrainDocNum += len(cat2train_ids[cat])
        keptTestDocNum += len(cat2test_ids[cat])
        keptAllDocNum += len(cat2train_ids[cat]) + len(cat2test_ids[cat])
        
    print "Totally %d docs kept, %d in train, %d in test" %( keptAllDocNum, 
                        keptTrainDocNum, keptTestDocNum )    
    
    if setName == "train":
        cat2set_ids = cat2train_ids
        setDocNum = keptTrainDocNum
    elif setName == "test":
        cat2set_ids = cat2test_ids
        setDocNum = keptTestDocNum
    elif setName == "all":
        cat2set_ids = cat2all_ids
        setDocNum = keptAllDocNum
    else:
        raise Exception("Unknown set name %s" %setName)
            
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    readDocNum = 0
    totalLineNum = 0
    emptyFileNum = 0
    
    for cat_id, cat in enumerate(topN_cats):
        for doc_id in cat2set_ids[cat]:
            if readDocNum % 50 == 49 or readDocNum == setDocNum - 1:
                print "\r%d %d\r" %( readDocNum + 1, totalLineNum ),
            text = html.unescape( reuters.raw(doc_id) )
            text = text.encode("utf-8")
            lines = text.split("\n")
            if len(text) == 0 or len(lines) == 0:
                emptyFileNum += 1
                continue
        
            readDocNum += 1
            totalLineNum += len(lines)
        
            text = " ".join(lines)
            wordsInSentences, wc = extractSentenceWords(text)
            
            filename = doc_id
            orig_docs_words.append( wordsInSentences )
            orig_docs_name.append(filename)
            orig_docs_cat.append(cat_id)
            cats_docsWords[cat_id].append(wordsInSentences)
            cats_docNames[cat_id].append(filename)
            
    print "Done. %d docs read, %d empty docs skipped. Totally %d lines" %(readDocNum, emptyFileNum, totalLineNum)
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
                cats_docsWords, cats_docNames, topN_cats
    
Example #56
    print(m + ':', fdist[m])


cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# Reuters corpus

from nltk.corpus import reuters
reuters.fileids()
reuters.categories()

reuters.categories(['training/9865', 'training/9880'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories=['barley', 'corn'])

# Inaugural address corpus
from nltk.corpus import inaugural
inaugural.fileids()
# Universal Declaration of Human Rights in multiple languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
Example #57
    def first_occurrence_perc(self, term, document):
        return term.first_occurrence(document) / len(document.doc)

    # def terminals(self, term, document):
    #     return (
    #         self.bool(term, document), self.tf(term, document), self.tf_idf(term, document), self.tf_ig(term, document),
    #         self.tf_chi(term, document), self.tf_rf(term, document))

    def raw_terminals(self, term, document):
        return (self.bool(term, document), self.tf(term, document), self.max_prob_term_and_category(term, document),
                self.max_prob_term_not_category(term, document), self.avg_prob_term_category(term, document),
                self.avg_prob_term_not_category(term, document), self.first_occurrence_perc(term, document))


if __name__ == '__main__':
    training_fileids = fileids = list(filter(lambda fileid: "training" in fileid and len(reuters.categories(fileid)) == 1,
                                             reuters.fileids()))
    documents = [sum(reuters.sents(fid), []) for fid in training_fileids]
    doc = documents[0]
    term = terminals.WordTerm("in")
    docs_categories = [reuters.categories(fid)[0] for fid in training_fileids]
    print(docs_categories)
    print(doc)
    fe = TWSCalculator(documents, docs_categories)

    print("tf =", fe.tf(term, doc), "idf =", fe.idf(term), "tf-idf =", fe.tf_idf(term, doc))

    term = terminals.WordTerm("in")

    print('TF-CHI: ', fe.tf_chi(term, doc))
    print('TF-CHI: ', fe.tf_chi(term, doc))
Example #58
	for word in flat_feat_pruned:
		features['%s' % word]=doc.count(word)
	return features

#full_feat=[(doc_features(cfd[condition].keys()),condition=='gold') for condition in cfd.conditions()]

#-------------------------------------
#Use chosen features to create train/test sets
#either based on boolean or count of features in docs
#-------------------------------------


#reutDocs=[(list(reuters.words(fileid)),cat)
#better way: lower, stem, and only keep alpha numeric
reutDocs=[([porter.stem(a.lower()) for a in reuters.words(fileid) if re.findall(r"[^\W\d]",a)] ,cat)
for cat in reuters.categories()
for fileid in reuters.fileids(cat)]

#randomize for later test/train split
random.seed(1979)
random.shuffle(reutDocs)



#run feature extractor on reutDocs instead of cfd; pickled as 'featureset.pkl'
#Boolean based features e.g. 1 or 0 for each word in flat_feat_pruned per document
featureset=[(doc_features(d),c) for (d,c) in reutDocs]
#Count based features
count_featureset=[(count_features(d),c) for (d,c) in reutDocs]

#-------------------------------------
Example #59
def encode_doc(doc):
    d = np.zeros((1, max_len - 1, n_chars), dtype=bool)
    for p, j in enumerate(doc.lower()[:max_len - 1]):
        d[0, p, char_to_idx[j]] = 1
    return d

# load or create the character encoding dictionaries
if os.path.exists(char_idx_path):
    with open(char_idx_path, 'rb') as f:
        logger.info('Loading character encodings from "%s"' % char_idx_path)
        idx_to_char = pickle.load(f)
        char_to_idx = pickle.load(f)
        cat_enc = pickle.load(f)
else:
    n_docs = len(reuters.fileids())
    cat_enc = dict((x, i+1) for i, x in enumerate(set(reuters.categories())))

    chars = set()
    for fid in reuters.fileids():
        chars = chars.union(set(reuters.raw(fid).lower()))

    idx_to_char = dict((i, c) for i, c in enumerate(chars))
    char_to_idx = dict((c, i) for i, c in enumerate(chars))

    with open(char_idx_path, 'wb') as f:
        logger.info('Saving character encodings to "%s"' % char_idx_path)
        pickle.dump(idx_to_char, f)
        pickle.dump(char_to_idx, f)
        pickle.dump(cat_enc, f)

if os.path.exists(reuters_enc_path):
Example #60
# Logistic Regression parameters
ITERATIONS = 800
ALPHA = 0.1
THRESHOLD = 0.5
#LAMBDA = 1

## Extract the fileids of the necessary subset of documents ##
documents_fileids = reuters.fileids(CATEGORIES)

## Split into training and testing ##
training_fileids = [w for w in documents_fileids if w.startswith('training')]
testing_fileids = [w for w in documents_fileids if w.startswith('test')]

## Extract features ##
most_frequent_words = extract_most_frequent_words(training_fileids, NUM_MOST_FREQUENT)
training_featureset = [(document_features(reuters.words(fileid), most_frequent_words), reuters.categories(fileid)) for fileid in training_fileids]
testing_featureset = [(document_features(reuters.words(fileid), most_frequent_words), reuters.categories(fileid)) for fileid in testing_fileids]

## Train a classifier ##
# Create a matrix with the values of the features to send to the classifier
training_matrix = np.array([list(x.values()) for x, y in training_featureset])
# Create a matrix with only the classes labels of each document
classes_vector = [y for x, y in training_featureset]
n = training_matrix.shape[1] # Number of features
m = len(training_matrix) # Number of training documents
# Add the bias term to the training matrix
training_matrix = np.concatenate((np.ones((m,1)), training_matrix), axis = 1)

# Create ten classifiers
classifiers = dict( (c, np.zeros((n+1,1)) ) for c in CATEGORIES )