Example #1
    def __init__(self, root, **kwargs):
        """
        Initialize a PLoS reader with a specific corpus. Corpus
        information is contained in the 'root/corpus_info.json' file.

        @type  root: string
        @param root: The directory path to the corpus directory.
        """
        self._root = root
        fp = open('%s/corpus_info.json' % root, 'r')
        self._corpus_info = info = json.load(fp)
        fp.close()

        # doc_part is specific to PLoS and research articles in general.
        # 'abstract' and 'body' are currently supported.
        # The corpus contains separate text for each, but the
        # reader is initialized to read only one.
        if 'doc_part' in kwargs:
            self._doc_part = doc_part = kwargs['doc_part']
            del kwargs['doc_part']
        else:
            self._doc_part = doc_part = 'body'

        if 'fileids' not in kwargs:
            fileids = [doi2fn(d, doc_part) for d in info['d2c'].keys()]
        else:
            fileids = kwargs['fileids']

        # cat_map: fileid -> [c1, c2, ...]
        # The fileids depend on what the doc_part is ('body', 'abstract').
        cat_map = {}
        for d, cat in info['d2c'].iteritems():
            cat_map[doi2fn(d, doc_part)] = cat

        kwargs['cat_map'] = cat_map
        # Subclass of CategorizedPlaintextCorpusReader
        CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
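
The reader above passes its category assignments through the cat_map keyword, which CategorizedPlaintextCorpusReader inherits from CategorizedCorpusReader: a dictionary mapping each fileid to its list of category labels. A minimal, self-contained sketch of that mechanism (the directory, file names, and labels are hypothetical):

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# cat_map maps each fileid to a list of category labels.
cat_map = {
    'doc1_body.txt': ['biology'],
    'doc2_body.txt': ['biology', 'medicine'],
}
reader = CategorizedPlaintextCorpusReader('corpus/', r'.*\.txt', cat_map=cat_map)
print(reader.categories())          # ['biology', 'medicine']
print(reader.fileids('medicine'))   # ['doc2_body.txt']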
Example #2
 def _extract_meta_data(self, corpus_path):
     self.db = CategorizedPlaintextCorpusReader(corpus_path,
                                                r'.*\.txt',
                                                cat_pattern=r'(\w+)/*')
     new_corpus = Corpus(self.db, corpus_path, self.limit_memory,
                         self.verbose)
     return new_corpus.get_meta_features()
Example #3
    def __init__(self,
                 input_folder_name,
                 doc_pattern,
                 categ_pattern,
                 encoding='utf-8'):
        CategorizedPlaintextCorpusReader.__init__(self,
                                                  input_folder_name,
                                                  doc_pattern,
                                                  cat_pattern=categ_pattern)
        self.input_folder_name = input_folder_name
        self.encoding = encoding
        self.root_reader = PlaintextCorpusReader(input_folder_name,
                                                 fileids=r'[^\/]*.' +
                                                 doc_pattern[-3:])
        #self.root_ids =[ os.path.join(input_folder_name,item) for item in self.root_reader.fileids()]

        self.root_ids = list(self.root_reader.fileids())
Example #4
 def predict(self, test_path):
     documents, self.y_test = self._read_corpus(
         CategorizedPlaintextCorpusReader(test_path,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*'),
         test_path)
     self.X_test = self.representation.transform(documents)
     return self.automl.predict(self.X_test)
Example #5
  def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus. Corpus
    information is contained in the 'root/<corpus_type>_corpus_info.json' file.

    @type  root: string
    @param root: The directory path to the corpus.
    """
    self._root = root

    # corpus_type is specific to Plos_builder:
    # full     - all documents that were built.
    # partial  - documents excluding training.
    # training - documents intended for training.
    if 'corpus_type' in kwargs:
      self._corpus_type = kwargs['corpus_type']
      del kwargs['corpus_type']
    else:
      self._corpus_type = 'full'

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
      self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    if 'doc_part' in kwargs:
      self._doc_part = doc_part = kwargs['doc_part']
      del kwargs['doc_part']
    else:
      self._doc_part = doc_part = 'body'

    if 'fileids' not in kwargs:
      fileids = [doi2fn(d, doc_part) for d in self.dois()]
    else:
      fileids = kwargs['fileids']

    # cat_map: fileid -> [c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    kwargs['cat_map'] = {doi2fn(d, doc_part): cat for d, cat in info['dois_to_categories'].iteritems()}
    # Subclass of CategorizedPlaintextCorpusReader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
Example #6
	def load_documents(self,path):
		docs = CategorizedPlaintextCorpusReader(path,r'.*/.*',cat_pattern=r'(.*)/.*')
		print docs.categories()
		documents = [(list(docs.words(fileid)), category)
				for category in docs.categories()
				for fileid in docs.fileids(category)
		]
		random.shuffle(documents)
		return documents
Example #7
	def load_documents(self,path):
		docs = CategorizedPlaintextCorpusReader(path,r'.*/.*',cat_pattern=r'(.*)/.*')
		for cat in docs.categories():
			self.cat_gram_freq[cat] = {}
			self.cat_word_freq[cat]={}
		return ((category,list(docs.words(fileid))) 
			for category in docs.categories() 
			for fileid in docs.fileids(category))
Example #8
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('directory',
                        help="the bill directory")
    parser.add_argument('--bigrams', action='store_true', dest='bigrams',
                        default=False, help='use bigrams')
    args = parser.parse_args()

    if args.bigrams:
        featurizer = bigram_feats
    else:
        featurizer = word_feats

    corpus = CategorizedPlaintextCorpusReader(
        root=args.directory,
        fileids=".*/.*\.txt",
        cat_pattern=r'(dem|rep)/')

    best_words = most_informative_words(corpus)

    dem_ids = corpus.fileids(categories=['dem'])
    rep_ids = corpus.fileids(categories=['rep'])

    dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem')
                 for f in dem_ids]
    rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep')
                 for f in rep_ids]

    # keep 5/6 of each class for training (integer cutoff for slicing)
    dem_cutoff = len(dem_feats) * 5 // 6
    rep_cutoff = len(rep_feats) * 5 // 6
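
The snippet ends right after computing the cutoffs; a plausible continuation, assuming nltk is imported elsewhere in the script and using a Naive Bayes classifier (an assumption, not part of the original):

    # Train on the first 5/6 of each class, evaluate on the remainder.
    train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
    test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]

    classifier = nltk.NaiveBayesClassifier.train(train_feats)
    print('accuracy:', nltk.classify.accuracy(classifier, test_feats))
    classifier.show_most_informative_features(10)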
Example #9
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH',
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
classifier.show_most_informative_features(5)

Example #10

#Document Classification

#Load Libraries

import os
import random
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

#Read the dataset into the categorized corpus

# Directory of the corpus
corpusdir = 'corpus/' 
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt', cat_pattern=r'\d+_(\w+)\.txt')

# list of documents(fileid) and category (pos/neg)
documents = [(list(review_corpus.words(fileid)), category)
              for category in review_corpus.categories()
              for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)

type(review_corpus)

len(documents)

#Compute word frequency
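
The closing comment announces a word-frequency step that the snippet never reaches; a minimal continuation, assuming NLTK's FreqDist over the words of review_corpus defined above:

from nltk import FreqDist

# Frequency of every (lower-cased) token in the corpus, most common first.
all_words = FreqDist(w.lower() for w in review_corpus.words())
print(all_words.most_common(20))

# Keep the 2000 most frequent words as candidate features for classification.
word_features = [w for w, _ in all_words.most_common(2000)]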
Example #11
    parser = argparse.ArgumentParser()
    parser.add_argument('directory', help="the bill directory")
    parser.add_argument('--bigrams',
                        action='store_true',
                        dest='bigrams',
                        default=False,
                        help='use bigrams')
    args = parser.parse_args()

    if args.bigrams:
        featurizer = bigram_feats
    else:
        featurizer = word_feats

    corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                              fileids=".*/.*\.txt",
                                              cat_pattern=r'(dem|rep)/')

    best_words = most_informative_words(corpus)

    dem_ids = corpus.fileids(categories=['dem'])
    rep_ids = corpus.fileids(categories=['rep'])

    dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem')
                 for f in dem_ids]
    rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep')
                 for f in rep_ids]

    # keep 5/6 of each class for training (integer cutoff for slicing)
    dem_cutoff = len(dem_feats) * 5 // 6
    rep_cutoff = len(rep_feats) * 5 // 6
Example #12
import os
import nltk
import re
from math import log
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()
stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []
root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']
reader = CategorizedPlaintextCorpusReader(root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]


def remove_accents(text):
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)
Example #13

# fileids_ = corpus_dir + '/rt-polarity*'

corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'

cat_map_ = {'rt-polarity.pos': ['pos'], 'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)

encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'

categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])

# NOTE: para views are not working to be looked into later

# classification
train = pos_words
Example #14
import sys
import cPickle as pickle

from itertools import chain

# from nltk import trigrams, word_tokenize, sent_tokenize, FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.util import ingrams

n = 3

train_path = "data/task1_train"

print "Loading categorized corpus in", train_path, "..."

cr = CategorizedPlaintextCorpusReader(train_path, ".*", cat_pattern="(\w*)")

# Get categories
print "%d categories: %s" % (len(cr.categories()), ", ".join(cr.categories()))

for c in [cr.categories()[0]]:
    print c + "..."
    sys.stdout.flush()

    ngrams = {}
    for i in range(n, 0, -1):
        print str(i) + "-grams..."
        ngrams[i] = {}
        prefix = ("",) * (i - 1)
        for ngram in ingrams(chain(prefix, cr.words(categories=[c])), n):
            if not ngram in ngrams[i]:
Example #15
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    return continuous_chunk



# create a corpus from the txt files given, with a file of categories to apply to the texts
corpus = CategorizedPlaintextCorpusReader(
                           'corpus/', 
                           r'.*\.txt',
                           cat_file="../textcats.prn")
"""
fileid="nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
#bloblist = corpus.fileids(categories='2016')
M=len(bloblist)
# Look at the categories
corpus.categories()
Example #16
import sys
from time import time
from itertools import chain
import cPickle as pickle


from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ingrams


print 'Loading corpus...',
t = time()

train_path = 'data/task1_train'
cr = CategorizedPlaintextCorpusReader(train_path, '.*', cat_pattern='(\w*)')

t = time() - t
print str(t) + 's'

# Test generation of CFD
print 'Creating CFD...',
sys.stdout.flush()
t = time()

cat = cr.categories()[0]

n = 3

cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)
Example #17
import os
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

path = os.path.join(os.getcwd(), "debates")

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader(path,
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)


def tag_corpus(corpus):
    return [nltk.pos_tag(sent) for sent in corpus.sents()]


tagged_corpus = tag_corpus(corpus)

import spacy

nlp = spacy.load('en')


def spacy_ner(tokenized_sent):
    doc = nlp(' '.join(tokenized_sent))
    for ent in doc.ents:
        return ent.text, ent.label_

Example #18
def create_corpus(directory):
    word_tokenize = RegexpTokenizer('\w+(?:-\w+)*(?:[?!.,:])*')
    sent_tokenize = nltk.data.load('tokenizers/punkt/french.pickle')
    translation = str.maketrans("", "", ",.?!:")
    corpus = CategorizedPlaintextCorpusReader(directory, r"^[^.]*$", cat_file='cats.txt', encoding="iso-8859-1", word_tokenizer=word_tokenize, sent_tokenizer=sent_tokenize)
    return corpus
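
create_corpus relies on a cat_file ('cats.txt') rather than a cat_pattern: in NLTK, each line of that file names a file identifier followed by its whitespace-separated category labels. A hedged usage sketch, with a hypothetical directory and hypothetical French file names (the doc pattern above matches files without an extension):

# Hypothetical contents of <directory>/cats.txt:
#   lettre_001  administratif
#   lettre_002  personnel
#   lettre_003  personnel urgent
corpus = create_corpus('corpus_fr/')
print(corpus.categories())                      # e.g. ['administratif', 'personnel', 'urgent']
print(corpus.fileids(categories='personnel'))   # e.g. ['lettre_002', 'lettre_003']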
Example #19
                    label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()

    return model


if __name__ == "__main__":
    PATH = "model.pickle"
    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")

    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS,
                                               DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # since `nli` already has all the information (text and ids)
        # you don't need to iterate over it multiple times so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])
        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
Example #20
import os
import re
import csv

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

srm_data_dir = '/home/mayank/work/projects/SRM/Data'

training_file_path = os.path.join(srm_data_dir, 'TrainingData.csv')
training_file = open(training_file_path, 'r')

root_dir = os.path.join(srm_data_dir, 'sub_data')

# normal_reader = PlaintextCorpusReader(root=root_dir,
#                                       fileids=['Financial.csv'])

# cat_map maps each fileid to its list of category labels.
cat_map_ = {
    'Compliance.csv': ['Compliance'],
    'Financial.csv': ['Financial'],
    'Operational.csv': ['Operational'],
    'Strategic.csv': ['Strategic']
}

cat_reader = CategorizedPlaintextCorpusReader(root=root_dir,
                                              fileids=r'.*\.csv',
                                              cat_map=cat_map_)
Example #21
import re
from sklearn import metrics
import string
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from poslemma import LemmatizationWithPOSTagger
from ordinal_classification import OrdinalSVC
from sklearn.model_selection import StratifiedKFold

base_dir = '/Users/ja/Documents/www'
corpus_name = 'corpus'

min_topics = 10
max_topics = 60

corpus = CategorizedPlaintextCorpusReader(
    os.path.join(base_dir, corpus_name), fileids=r'(?!\.).*\.txt', cat_pattern=r'(\w+)/*')


def clean(doc):
    lemma = LemmatizationWithPOSTagger()
    stop = set(stopwords.words('english') + stopwords.words('numbers'))
    exclude = set(string.punctuation)
    wordchars = set(string.ascii_letters)
    wordchars |= set(string.digits)

    def contains_any(str, set):
        """Check whether 'str' contains ANY of the chars in 'set'"""
        return 1 in [c in str for c in set]

    def is_number(s):
        try:
Example #22
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('corpus/text',
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

print(corpus.categories())
print(corpus.fileids('2019'))