Example #1
def demo(**kwargs):
    import nltk
    from nltk.corpus.util import LazyCorpusLoader
    from nltk_contrib.coref import NLTK_COREF_DATA
    from nltk_contrib.coref.muc import muc6_documents, muc7_documents
    from nltk_contrib.coref.muc import MUCCorpusReader
    nltk.data.path.insert(0, NLTK_COREF_DATA)
    muc6 = LazyCorpusLoader('muc6/', MUCCorpusReader, muc6_documents)
    for sent in muc6.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc6.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
    muc7 = LazyCorpusLoader('muc7/', MUCCorpusReader, muc7_documents)
    for sent in muc7.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc7.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
Example #2
def new_wordnet_instance():
    """
    Create a new wordnet instance. This is useful for parallel workflows.
    Multiple processes cannot share the same wordnet instance (as when it is
    imported globally with `from nltk.corpus import wordnet`), because nltk
    is not thread-safe.
    """
    return LazyCorpusLoader(
        'wordnet', WordNetCorpusReader,
        LazyCorpusLoader('omw', CorpusReader,
                         r'.*/wn-data-.*\.tab', encoding='utf8')
    )
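A minimal usage sketch (not part of the original example): each worker process builds its own reader with new_wordnet_instance(), assuming the standard 'wordnet' and 'omw' corpora are installed under nltk_data.

from multiprocessing import Pool

def count_synsets(word):
    # a fresh, process-local reader avoids sharing lazily-loaded state across workers
    wn = new_wordnet_instance()
    return word, len(wn.synsets(word))

if __name__ == '__main__':
    with Pool(4) as pool:
        print(pool.map(count_synsets, ['bank', 'run', 'set']))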
Example #3
File: DealCorpus.py Project: LM1201/ABA
    def loaddiff(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        ## Chinese directory names come out garbled unless decoded from GB2312
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)
        pattern_1 = r".*/diff1/.*\.txt"
        pattern_2 = r".*/diff2/.*\.txt"
        pattern_3 = r".*/diff3/.*\.txt"

        self.logger.info("加载语料库 lazyload")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_1)
        self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_2)
        self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader, pattern_3)
        self.logger.info("加载语料库 完毕")
Example #4
def make_classifier():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]

    twitter_samples = LazyCorpusLoader('twitter_samples',
                                       TwitterCorpusReader,
                                       files,
                                       word_tokenizer=CustomTokenizer())

    #this returns a list of lists
    twitter_tokens = twitter_samples.tokenized()

    #need to unpack our list of lists, using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(100)
    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = make_extract_features_func(master_list_of_words)

    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)

    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]

    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)

    training_set = nltk.classify.apply_features(extraction_function,
                                                all_tokens)

    classifier = NaiveBayesClassifier.train(training_set)

    return classifier, master_list_of_words
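A hedged usage sketch for the classifier built above; make_extract_features_func is not shown in this example, so it is assumed here to return a callable that maps a token list to a feature dict, as apply_features() implies.

classifier, vocabulary = make_classifier()
extract_features = make_extract_features_func(vocabulary)  # helper assumed from the example above
print(classifier.classify(extract_features(['what', 'a', 'great', 'day'])))
classifier.show_most_informative_features(10)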
Example #5
    def build_terms(self, terms):
        # save the original corpus
        corpus_temp = terms.kwargs["corpus"]
        groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)',
                          corpus_temp.root.path)
        terms.kwargs["corpus"] = LazyCorpusLoader(
            "c50_tags/" + groups.group(1),
            CategorizedPlaintextCorpusReader,
            r'.+/.+',
            cat_pattern=r'(.+)/.+')

        print "In ModeWeightClassCollocationPOS"
        cache_file = "%s.dat" % terms.name
        terms.tokens = []
        shelf = shelve.open(cache_file, protocol=2)

        for author in terms.kwargs["corpus"].categories():

            author_files = set(terms.kwargs["corpus"].fileids([author])) & set(
                terms.kwargs["source"])
            author_files = list(author_files)
            if len(author_files) == 0:
                continue

            author_files.sort()
            #print "str(author_files): " + str(author_files)
            #print "str(terms.kwargs["corpus"]): " + str(terms.kwargs["corpus"]) + " str(terms.kwargs["corpus"].fileids([author])): " + str(terms.kwargs["corpus"].fileids([author])) + " str(terms.kwargs[\"source\"]): " + str(terms.kwargs["source"])
            f_srcs = "|".join(author_files)

            terms.kwargs["string"] = \
            terms.kwargs["corpus"].raw(fileids=author_files).lower()

            if f_srcs in shelf and terms.kwargs["lazy"]:
                terms.tokens += shelf[f_srcs]
                #print(str(f_src))
                #print("%s ... Found in \"%s\"" % (f_src, cache_file))
            else:
                terms.kwargs["string"] = \
                terms.kwargs["corpus"].raw(fileids=author_files).lower()

                temp_tokens = terms.calc_terms()

                # because the later call to calc_terms turns this option off,
                # but we still need it
                terms.kwargs["boolBuildSetGlobal"] = True
                terms.kwargs["mode"] = EnumModes.MODE_CORPUS_POS_GLOBAL_A
                ###############################################################

                terms.tokens += temp_tokens

                if terms.kwargs["lazy"]:
                    shelf[f_srcs] = temp_tokens

                #print ("%s ... Recalculated in \"%s\"" % (f_src, cache_file))
        terms.kwargs["boolBuildSetGlobal"] = False
        terms.kwargs["mode"] = EnumModes.MODE_CORPUS
        shelf.close()

        # restore the original corpus
        terms.kwargs["corpus"] = corpus_temp
Example #6
File: chasen.py Project: yuanlanda/nltk
def test():

    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")

    assert isinstance(jeita.tagged_words()[0][1], str)
Example #7
def load_treebank(sections):
    treebank_path = os.environ.get('NLTK_TREEBANK', 'treebank/combined')
    treebank = LazyCorpusLoader(
        treebank_path,
        BracketParseCorpusReader, 
        r'(%s\/)?wsj_%s.*\.mrg' % (sections, sections))
    return treebank
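Hypothetical call (the example itself shows none): `sections` is used both as an optional directory prefix and as the wsj_ file prefix, so a regex fragment such as '00' or '0[2-4]' selects the corresponding WSJ sections.

treebank = load_treebank('00')   # with the default nltk 'treebank/combined' sample this matches wsj_00xx.mrg
print(treebank.fileids()[:3])
print(treebank.parsed_sents()[0])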
Example #8
class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        '''
        Detect the text's language
        '''
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])
        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1
        total = sum(trigrams.values())
        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(
                    frequencies.N())) * (float(count) / float(total))
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]

    def get_word_trigrams(self, match):
        return [
            ''.join(trigram) for trigram in nltk_trigrams(match)
            if trigram is not None
        ]
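Example usage (assuming the 'langid' corpus with its *-3grams.txt files is installed under nltk_data, as the LazyCorpusLoader call above expects):

detector = LangDetect()
print(detector.detect("Dit is een korte Nederlandse zin."))  # expected: 'nl'
print(detector.detect("This is a short English sentence."))  # expected: 'en'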
Example #9
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print(
        '\n'.join(
            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
Example #10
def ham_corpus_maker(outpath, word):
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    count = 0
    instancenum = 0
    targetwordnum = 0
    for file in corpus.fileids():
        #print file

        for doc in corpus.xml(file).getchildren():

            # print doc.getchildren()
            cat = doc.getchildren()[3].text  #
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            textlines = newtext.split('.')
            if word in newtext.split():
                print newtext
                outfile.write(newtext)
                outfile.write('\n')
                print
                print

    print str(instancenum) + " seeds found "
    print str(targetwordnum) + " target word found "

    outfile.close()
Example #11
def ClassifierModel():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]
    twitter_samples = LazyCorpusLoader('twitter_samples',
                                       TwitterCorpusReader,
                                       files,
                                       word_tokenizer=CustomTokenizer())

    #this returns a list of lists
    twitter_tokens = twitter_samples.tokenized()

    #need to unpack the list of lists using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(200)

    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = feature_extraction(master_list_of_words)
    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)
    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]
    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)
    #creating training set
    training_set = nltk.classify.apply_features(extraction_function,
                                                all_tokens)

    #creating a classifier by calling the train method
    classifier = NaiveBayesClassifier.train(training_set)

    return classifier, master_list_of_words
Example #12
def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
    if corpus == 'timit':
        return LazyCorpusLoader('timit',
                                NumberedTaggedSentCorpusReader,
                                r'.+\.tags',
                                tag_mapping_function=simplify_wsj_tag)

    real_corpus = getattr(nltk.corpus, corpus, None)

    if not real_corpus:
        if not reader:
            raise ValueError('you must specify a corpus reader')

        if not fileids:
            fileids = '.*'

        root = os.path.expanduser(corpus)

        if not os.path.isdir(root):
            if not corpus.startswith('corpora/'):
                path = 'corpora/%s' % corpus
            else:
                path = corpus

            try:
                root = nltk.data.find(path)
            except LookupError:
                raise ValueError('cannot find corpus path for %s' % corpus)

        reader_cls = import_attr(reader)
        real_corpus = reader_cls(root, fileids, **kwargs)

    return real_corpus
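A usage sketch under two assumptions: import_attr resolves a dotted class path, and the corpus lives in a local directory rather than under nltk_data.

reader = load_corpus_reader(
    '~/my_corpus',                                      # expanded via os.path.expanduser
    reader='nltk.corpus.reader.PlaintextCorpusReader',  # dotted path assumed to be resolved by import_attr
    fileids=r'.*\.txt')
print(reader.fileids()[:5])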
Example #13
def test():

    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    assert isinstance(jeita.tagged_words()[0][1], compat.string_types)
Example #14
def test():

    from nltk.corpus.util import LazyCorpusLoader
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Example #15
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(
            lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Example #16
File: GetCorpus.py Project: LM1201/ABA
    def loadcorpus(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        ## Chinese directory names come out garbled unless decoded from GB2312
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)

        pattern_1 = r".*/diff1/.*\.txt"
        pattern_2 = r".*/diff2/.*\.txt"
        pattern_3 = r".*/diff3/.*\.txt"
        from nltk.corpus.util import LazyCorpusLoader
        from nltk.corpus import PlaintextCorpusReader
        self.logger.info("加载语料库")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_1)
        self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_2)
        self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_3)
        self.logger.info("加载完毕")
Example #17
    def calc_terms(self, kwargs, f_src):
        # save the original corpus
        corpus_temp = kwargs["corpus"]

        groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)', corpus_temp.root.path)
        kwargs["corpus"] = LazyCorpusLoader("c50_term_SFM_23/" + groups.group(1), CategorizedPlaintextCorpusReader, r'.+/.+', cat_pattern=r'(.+)/.+')

        sfm_terms = Util.calc_SFM(kwargs["corpus"].raw(fileids=[f_src]))

        # restore the original corpus
        kwargs["corpus"] = corpus_temp
        return sfm_terms
Example #18
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1",
                            KNBCorpusReader,
                            r".*/KN.*",
                            encoding="euc-jp")
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Example #19
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print('/'.join( jeita.words()[22100:22140] ))


    print('\nEOS\n'.join('\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent)
                          for sent in jeita.tagged_sents()[2170:2173]))
Example #20
class LangDetectTwitter(ModifiedMRJob):
    DEFAULT_INPUT_PROTOCOL = 'raw_value'
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def configure_options(self):
        super(LangDetectTwitter, self).configure_options()
        #self.add_file_option('--langs', default='languages.txt')

        #def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
    def __init__(self, *args, **kwargs):
        super(LangDetectTwitter, self).__init__(*args, **kwargs)
        #languages = [x.strip() for x in open(self.options.langs, 'r').readlines()]
        languages = [
            'fr', 'en', 'ar', 'es', 'de', 'it', 'id', 'pt', 'tr', 'ru', 'nl',
            'hi', 'sv', 'fi', 'da', 'pl', 'hu', 'fa', 'he', 'ur', 'th'
        ]
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def mapper(self, key, tweet):
        '''
        Detect the text's language
        '''
        obj = cjson.decode(tweet)
        text = obj['tx']
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1
        total = sum(trigrams.values())
        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(
                    frequencies.N())) * (float(count) / float(total))
        obj['lang'] = sorted(scores.items(), key=lambda x: x[1],
                             reverse=True)[0][0]
        yield key, obj

    def get_word_trigrams(self, match):
        return [
            ''.join(trigram) for trigram in nltk_trigrams(match)
            if trigram is not None
        ]
Example #21
		def __init__(self, languages=LangIDDict().keys()):
		
			self.language_trigrams = {}
			self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')
			
			for lang in languages:
				self.language_trigrams[lang] = FreqDist()
				for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
					self.language_trigrams[lang].inc(f[0], f[1])
				self.language_dicts = dict([
					(id, dict([(trigram, float(value)/float(fdist.N())) for trigram, value in fdist.items()]))
					for id, fdist in self.language_trigrams.items()
				])
Example #22
    def corpus(self):
        """
            Initialize the corpus object if it has not been created yet.
        """
        if self._corpus is None:
            # The fileid pattern r'(?!\.).*\.txt' and cat_pattern=r'(neg|pos)/.*' make it possible to find the files labelled neg and pos
            self._corpus = LazyCorpusLoader(self._corpusName,
                                            CategorizedPlaintextCorpusReader,
                                            r'(?!\.).*\.txt',
                                            cat_pattern=r'(neg|pos)/.*',
                                            encoding='ascii')

        return self._corpus
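A brief usage sketch (assuming the method above is exposed as a property and the corpus directory holds neg/ and pos/ subfolders of .txt files, as cat_pattern implies; `analyzer` is a hypothetical owning instance):

reader = analyzer.corpus           # hypothetical instance of the class shown above
print(reader.categories())         # ['neg', 'pos']
print(len(reader.fileids('pos')))  # number of positive documents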
Example #23
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita",
                             ChasenCorpusReader,
                             r".*chasen",
                             encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    print("\nEOS\n".join("\n".join("{}/{}".format(w[0], w[1].split("\t")[2])
                                   for w in sent)
                         for sent in jeita.tagged_sents()[2170:2173]))
Example #24
def load_data():
    abc = LazyCorpusLoader(
        "abc",
        PlaintextCorpusReader,
        r"(?!\.).*\.txt",
        encoding=[("science", "latin_1"), ("rural", "utf8")],
    )

    raw = abc.sents()
    sentences = []

    stopwords_ = list(stopwords.words('english'))
    final_stopwords = {w: 1 for w in stopwords_}

    for s in raw:
        words = []
        for w in s:
            if w.isalpha() and w.lower() not in final_stopwords:
                words.append(w.lower())
        sentences.append(words)

    word_counts = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            word_counts[word] += 1

    vocabulary = list(word_counts.keys())
    vocabulary.extend(["<START>", "<END>"])
    vocab_size = len(vocabulary)
    word_to_num = {word: n for n, word in enumerate(vocabulary)}
    num_to_word = {n: word for n, word in enumerate(vocabulary)}

    sums = [-2, -1, 1, 2]
    training_data = []
    for sentence in tqdm(sentences):
        length = len(sentence)
        for cur_index in range(length):
            cur_word = sentence[cur_index]
            context_vector = []
            for diff in sums:
                index = cur_index + diff
                if index >= 0 and index < length:
                    context_word = sentence[index]
                    context_vector.append(context_word)
            if len(context_vector) == 4:
                training_data.append([context_vector, cur_word])

    return vocab_size, vocabulary, word_to_num, num_to_word, training_data
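A short sketch of how the returned values might be consumed (the downstream CBOW-style trainer is not part of this example):

vocab_size, vocabulary, word_to_num, num_to_word, training_data = load_data()
context, target = training_data[0]
context_ids = [word_to_num[w] for w in context]  # indices of the four context words
target_id = word_to_num[target]                  # index of the centre word
print(context, '->', target, context_ids, target_id)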
Example #25
def read_knbc(train_file, test_file, reference_file):

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    knbc = LazyCorpusLoader('knbc/corpus1',
                            KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    sentences = knbc.sents()

    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
Example #26
def treebank_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader    
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankTaggerCorpusReader
    
    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankTaggerCorpusReader(state_union)
    
    print 'Treebank tagger demo...'
    print 'Tagged sentences:'
    for sent in state_union.tagged_sents()[500:505]:
        print sent
        print
    print
    print 'Tagged words:'
    for word in state_union.tagged_words()[500:505]:
        print word
    print
Example #27
def treebank_chunk_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader    
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader
    
    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    print 'Treebank chunker demo...'
    print 'Chunked sentences:'
    for sent in state_union.chunked_sents()[500:505]:
        print sent
        print
    print
    print 'Parsed sentences:'
    for tree in state_union.parsed_sents()[500:505]:
        print tree
        print
    print
Example #28
def hamshahri_targetword_corpus_maker(match, outpath):
    print 'loading hamshahri corpus'
    print
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    punclist = [u'،', u'؛', u':', u'؟', u'#']

    matchnum = 0
    count = 0
    print 'creating target corpus'
    for file in corpus.fileids():
        #print file

        for doc in corpus.xml(file).getchildren():

            #    print doc.getchildren()
            #          cat=doc.getchildren()[3].text#
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')

            for item in punclist:
                if item in newtext:
                    newtext = newtext.replace(item, '')
    #
    #        #  print newtext
    #
    #
            if match in newtext.split():
                #
                matchnum += 1
                print newtext
                print '#'
                count += 1
                #
                outfile.write(newtext)
                outfile.write('ALI')

    outfile.close()
    print count
Example #29
    def build_terms(self, terms):
        # save the original corpus
        corpus_temp = terms.kwargs["corpus"]
        groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)',
                          corpus_temp.root.path)
        terms.kwargs["corpus"] = LazyCorpusLoader(
            "c50_tagged/" + groups.group(1),
            CategorizedPlaintextCorpusReader,
            r'.+/.+',
            cat_pattern=r'(.+)/.+')

        #rint "In ModeCorpusPOS"
        cache_file = "%s.dat" % terms.name
        terms.tokens = []
        shelf = shelve.open(cache_file, protocol=2)

        for f_src in terms.kwargs["source"]:

            if f_src in shelf and terms.kwargs["lazy"]:
                terms.tokens += shelf[f_src]
                #print(str(f_src))
                #print("%s ... Found in \"%s\"" % (f_src, cache_file))
            else:
                terms.kwargs["string"] = \
                terms.kwargs["corpus"].raw(fileids=[f_src]).lower()

                temp_tokens = terms.calc_terms()
                terms.tokens += temp_tokens

                if terms.kwargs["lazy"]:
                    shelf[f_src] = temp_tokens

                #print ("%s ... Recalculated in \"%s\"" % (f_src, cache_file))
        shelf.close()

        # restore the original corpus
        terms.kwargs["corpus"] = corpus_temp
class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid',
                              LangIdCorpusReader,
                              r'(?!\.).*\.txt',
                              encoding='utf-8')
    tk = CharGramTokenizer()

    def __init__(self, languages):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        '''
        Detect the text's language
        '''
        if not isinstance(text, unicode):
            raise Exception('not unicode')

        trigrams = self.tk.tokenize(text)

        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])
        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(
                    frequencies.N())) * (float(count) / float(total))
        best_match = sorted(scores.items(), key=lambda x: x[1],
                            reverse=True)[0]
        if best_match[1] == 0:
            return ('other', 0)
        else:
            return best_match