Example #1
def tag_files(direc):

    with open('nltk_german_classifier_data.pickle', 'rb') as t:
        tagger = pickle.load(t)

    for subdir, dirs, files in os.walk(direc):
        for file in files:
            if file.endswith('.txt') and "nounlist" not in file:
                filename = os.path.join(direc, file)
                wordlist = PlaintextCorpusReader(direc, '.*')
                name = (filename.split("/"))[-1]
                #to_tag = wordlist.words(filename)
                to_tag = wordlist.words(name)

                tagged = tagger.tag(to_tag)
                name = name.replace(".txt", "")
                p = filename.split("/")
                path = p[0] + "/" + p[1] + "/" + p[2] + "/" + p[3] + "/tagged/"

                with open('%stagged_%s_data.pickle' % (path, name), 'wb') as f:
                    pickle.dump(tagged, f)

                nouns = []
                for word in tagged:
                    if word[1] == 'NN':
                        nouns.append(word[0])

                path = path.replace("tagged", "nouns")
                with open('%snoun-list_%s_data.pickle' % (path, name),
                          'wb') as f:
                    pickle.dump(nouns, f)

                #count noun frequency:
                noun_frequency = Counter(nouns)
                with open('%snoun-frequ_%s_data.pickle' % (path, name),
                          'wb') as f:
                    pickle.dump(noun_frequency, f)

                with open('%snounlist_%s.txt' % (path, name), 'w') as f:
                    f.write("\n".join(nouns))
Example #2
def give(filename):
    corpus_root = '/home/helios/Desktop/easclepius/easclepius'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')

    name = "/home/helios/Desktop/easclepius/easclepius/" + filename
    print(filename)
    print(name)
    print(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    textwords = [w.lower() for w in wordlists.words(name)]
    print(textwords)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(textwords)

    finder.apply_freq_filter(2)
    ignored_words = set(stopwords.words('english'))
    finder.apply_word_filter(lambda w: len(w) < 3 or w in ignored_words)
    a = finder.nbest(bigram_measures.likelihood_ratio, 5)
    fd = FreqDist(a)
    print(fd)
    return a
Example #3
def get_pos_features(dataset, feature_set_file, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' 
    dataset_path = corpus_root + dataset
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    feature_set_tuples = get_feature_set_tuples(feature_set_file)
    out_file = open(output_file,'w')
    ## Iterate over the file ids directly instead of indexing with range()
    for fid in ids:
        out_string = ''
        current_file = dataset + '/' + fid
        e = current_file.split('/')
        out_string = out_string + current_file + ' ' + e[-2]
        tagged_file = nltk.pos_tag(files.words(fid))
        for feature in feature_set_tuples:
            count =0
            for tag in tagged_file:
                if(feature == tag):
                    count = count +1
            out_string = out_string + " " + feature[1]+feature[0]+ ':' + str(count)
        out_file.write(out_string + '\n')
        out_file.flush()
Example #4
    def tag(self, path, files):
        corpus = PlaintextCorpusReader(path, files)
        count = 1
        taggedDocs = {}
        nes = {}
        for f in files:
            print("Tagging file " + f + ", " + str(count) + "/" +
                  str(len(files)))
            count += 1

            chunked = self.process(corpus.raw(f))
            for i in range(len(chunked)):
                for j in range(len(chunked[i])):
                    chunked[i][j] = self.neTagSentence(chunked[i][j])
            taggedDocs[f] = chunked

            foundNes = []
            for split in chunked:
                for sentence in split:
                    foundNes += self.getNesFromTree(sentence)
            nes[f] = foundNes
        return (taggedDocs, nes)
    def extractWordsOnly(self, article):
        templist = []
        listtextstring = []
        articlename = article + '.txt'
        #corpus_root = '/home/jesal/onedump/'
        wl = PlaintextCorpusReader(corpus_root, '.*')
        allwords = wl.words(fileids=articlename)
        exturllist = self.extractexternalURL(article)
        textstring = wl.raw(articlename)
        for item in exturllist:
            textstring = textstring.replace(item, ' ')

        #templist = re.sub(r'[.!,;?]', ' ', textstring).split()
        templist = nltk.word_tokenize(textstring)
        listtemp = []
        for i in templist:
            j = re.sub('[^A-Za-z]+', '', i)
            listtemp.append(str(j))

        templistfinal = []
        templistfinal = self.removeEmpty(listtemp)
        return templistfinal
Example #6
def generate_custom_lists(project_dir):
    """ Same as generate_existing_lists, but on custom corpora that reside in
    root/custom_corpora. Write a pickle file for each corpus. 
    """
    from nltk.corpus import PlaintextCorpusReader
    import os

    for dir in os.scandir("{}/custom-corpora/".format(project_dir)):
        if dir.is_dir():
            print("generating list for custom corpus '{}'".format(dir.name))
            custom_word_tags = defaultdict(set)
            custom_corpus = PlaintextCorpusReader(
                root="{}/custom-corpora/{}/".format(project_dir, dir.name),
                fileids=".*")

            # tokenize and tag sentences
            try:
                tags = get_tags_sentence(list(custom_corpus.sents()))
            except ValueError:
                print("No sentences found for corpus '{0}'. Did you place your"
                      " text files inside of /custom-corpora/{0}/ ?".format(
                          dir.name))
            else:
                for tag in tags:
                    custom_word_tags[tag[-1]].update([tag[0]])

            # write results, dump .pkl regardless if empty
            with open(
                    "{}/pre-generated-lists/custom_word_tags_{}.pkl".format(
                        project_dir, dir.name), "wb") as outfile:
                pickle.dump(custom_word_tags, outfile)

            print("Completed dumping of `{}` custom corpus. {} total words "
                  "saved".format(
                      dir.name,
                      sum([
                          len(values) for values in custom_word_tags.values()
                      ])))
Example #7
def LDA_pretreatment(filename):
    # read words from files
    corpus_root = filename
    file_pattern = r".*"
    ptb = PlaintextCorpusReader(corpus_root, file_pattern)
    # keep only words longer than 6 characters
    initialwords = [[w for w in ptb.words(fileid) if len(w) > 6]
                    for fileid in ptb.fileids()]
    # remove words that fail the spell check
    spellcheckedwords = [[w for w in document if d.check(w)]
                         for document in initialwords]
    # convert to lowercase
    lowerwords = [[w.lower() for w in document]
                  for document in spellcheckedwords]
    # lemmatize with WordNet
    wnl = nltk.WordNetLemmatizer()
    rectifywords = [[wnl.lemmatize(s) for s in document]
                    for document in lowerwords]
    # remove the stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    finitialwords = [[w for w in document if w not in stopwords]
                     for document in rectifywords]
    return finitialwords
Example #8
    def print_my_count(self, corpus, patt, n):
        wordlists = PlaintextCorpusReader(corpus, patt)
        fileids = wordlists.fileids()
        for id in fileids:
            words = wordlists.words(id)
            wordt = len(words)
            wordc = len(set(words))
            wor = "=> corpus tokens : " + str(wordt)
            dis = "=> distinct token types : " + str(wordc)
            ric = "=> ind lex richness : " + str(wordt / wordc)
            print("********************************************")
            print("1. Corpus parameters", "(= " + id + ")")
            print("********************************************")
            print(dis)
            print(ric)
            print(wor)
            print("********************************************")
            fre = FreqDist(word.lower() for word in words if word.isalpha())
            print("2. Top 100 words")
            print("********************************************")
            for word, count in fre.most_common(n):
                t = word, str(count) + "/" + str(wordt)
                print(t)
Example #9
def read_corpus(path, pattern):
    cr = PlaintextCorpusReader(path, pattern)
    for fid in cr.fileids():
        code = cr.raw(fid)
        try:
            tokens = list(tokenize.generate_tokens(io.StringIO(code).readline))
            tokens = [
                token for t in tokens
                if t.type == tokenize.NAME or t.type == tokenize.OP
                for token in t.string.split(" ")
            ]
            if tokens:
                # print("Task: %s; Color: %s" % (programming_task.name, color_val))
                # gensim's TaggedDocument expects the tags as a list
                yield TaggedDocument(tokens, [fid])
        except tokenize.TokenError as e:
            # print("%s: %s" % (type(e).__name__, e))
            pass
        except IndentationError as e:
            # print("%s: %s" % (type(e).__name__, e))
            pass
        except Exception as e:
            print("%s: %s" % (type(e).__name__, e))
            pass
Example #10
 def my_bar(self,corpus,patt,n):
     wordlists = PlaintextCorpusReader(corpus,patt)
     fileids = wordlists.fileids()
     k = len(fileids)
     figA = pylab.figure(1)
     figB = pylab.figure(2)
     li = ['Corpus']
     for id in fileids:
         if k > 1:
             i = fileids.index(id)+1
             words = wordlists.words(id)
             fre = FreqDist(word.lower() for word in words if word.isalpha())
             self.bar_count(fre,n,figA,2*k,2*i,id,li)
             self.bar_freq(fre,n,figB,2*k,2*i,id,li)
             figA.savefig('../data/complex-freq.pdf')
             figB.savefig('../data/complex-relfreq.pdf')
         else:
             words = wordlists.words(id)
             fre = FreqDist(word.lower() for word in words if word.isalpha())
             self.bar_count(fre,n,figA,k,1,id,li)
             self.bar_freq(fre,n,figB,k,1,id,li)
             figA.savefig('../data/simple-freq.pdf')
             figB.savefig('../data/simple-relfreq.pdf')             
     pylab.show()
Example #11
    def __init__(self, dname, sfile=None, stemmer=None):
        if not os.path.isdir(os.path.abspath(dname)):
            raise FileExistsError('invalid directory!')

        if sfile == "":
            self.stop_words = ()
        elif sfile is None:
            self.stop_words = set(nltk.corpus.stopwords.words('english'))
        else:
            if not os.path.exists(sfile):
                raise FileExistsError('invalid file!')
            else:
                reader = PlaintextCorpusReader(str(os.getcwd()), sfile)
                self.stop_words = set(reader.words([
                    sfile,
                ]))

        self.root = os.path.abspath(dname)
        if isinstance(stemmer, nltk.stem.porter.PorterStemmer) or \
            isinstance(stemmer, nltk.stem.snowball.SnowballStemmer) or \
                stemmer is None:
            self.stemmer = stemmer
        else:
            raise Exception('invalid stemmer')
Example #12
    def __init__(self,
                 corpus,
                 tf="raw",
                 idf="base",
                 stopword=nltk.corpus.stopwords.words('english'),
                 stemmer=PorterStemmer(),
                 ignorecase="yes"):

        self.corpus = corpus
        self.tf_key = tf
        self.idf_key = idf

        # Setup stop words
        if stopword == "none":
            self.stop_words = ()
        elif stopword is None or stopword == nltk.corpus.stopwords.words(
                'english'):
            self.stop_words = set(nltk.corpus.stopwords.words('english'))
        else:
            if not os.path.exists(stopword):
                raise FileExistsError('Invalid stopword file!')
            else:
                reader = PlaintextCorpusReader(str(os.getcwd()), stopword)
                self.stop_words = set(reader.words([
                    stopword,
                ]))

        self.stemmer = stemmer
        self.ignorecase = ignorecase

        # Setup caching self variables
        self.dim = set()
        self.tf = {}
        self.idf = {}
        self.tf_idf_res = {}
        self.words_dict = {}
Example #13
 def read(self):
     portuguese_sent_tokenizer = nltk.data.load(
         "tokenizers/punkt/portuguese.pickle")
     newcorpus = PlaintextCorpusReader(
         os.path.join(self.s.path, "leis"),
         ".*",
         sent_tokenizer=portuguese_sent_tokenizer)
     ponctuation = [
         ".", "!", "-", "?", ",", "lei", "artigo", ")", "(", "subchefia",
         ":", "presidente", "$", "°"
     ]
     #print(newcorpus.words())
     for files in newcorpus.fileids():
         words = newcorpus.words(files)
         words = [w.lower() for w in words]
         filtered_words = [
             word for word in words
             if word not in stopwords.words("portuguese")
         ]
         filtered_words = [
             word for word in filtered_words if word not in ponctuation
         ]
         fd1 = nltk.FreqDist(filtered_words)
         print(files, fd1.most_common(10))
Example #14
    def occStats(self, path, format, list, plotting):
        wordlists = PlaintextCorpusReader(path, format)
        fileids = wordlists.fileids()
        k = len(fileids)

        # computing rel frequencies
        self.fileStats(path, fileids)

        # closing threads
        #self.pool.close()
        #self.pool.join()

        # plotting vars
        figname = "Base GQs"
        figpath = plotting + '/' + figname.replace(' ', '-') + '-stats.pdf'
        savpath = plotting + '/' + figname.replace(' ', '-')

        # plotting
        MyPlot(self.stats, self.classstats, figname, "one", plotting,
               list)  # all

        # generating report
        SaveStats(self.classstats, self.stats, figpath, savpath,
                  plotting)  # all
Example #15
################### Correct character encoding ######################
#coding: utf-8

###########################################################################
####################### Required libraries ############################
import nltk
from nltk.corpus import PlaintextCorpusReader
import os

###########################################################################
###################### Initial configuration #################################
os.chdir('/home/joaopedropp/Uptpt/')
A = os.listdir('/home/joaopedropp/Uptpt/')
textos = '/home/joaopedropp/Uptpt/'
t = PlaintextCorpusReader(textos, '.*')

###########################################################################
###################### Classification vectors #############################
Tele = [
    "dsl", "amps", "analógico", "amplificador", "ansi", "antena", "banda",
    "broadband", "cdma", "comutação", "discagem", "dsl", "dtmf", "erlang",
    "espectro", "rádio-base", "frequência", "gsm", "hertz", "interferência",
    "isdn", "largura de banda", "modulação", "qpsk", "multimídia",
    "multiplexador", "pcm", "propagação", "roaming", "ruído", "sinalização",
    "tdd", "tdm", "tdma", "telefonia fixa", "canal", "viva-voz", "voip"
]

Eletro = [
    'amperímetro', 'capacitância', 'capacitor', 'circuito', 'tensão', 'smd',
    'corrente alternada', 'corrente contínua', 'diodo', 'eletromagnetismo',
    'fet', 'led', 'mosfet', 'multímetro', 'ohmímetro', 'potenciômetro',
Example #16
    def extract_data(self,
                     filepath,
                     ind_features=_PARAIND_FEAT,
                     dep_features=_PARADEP_FEAT,
                     labels_per_sent=None,
                     labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" +
                                           string.punctuation + "]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"

        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([
                numpy_filepath_pca,
        ]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                     len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                          ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(
            self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)

        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn,
                         self.dictVectorizer.get_feature_names(),
                         arff_filepath,
                         filename + "_RAW",
                         labels_per_window,
                         file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)

        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(
                matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca,
                             feature_names,
                             arff_filepath_pca,
                             filename + "_PCA95",
                             labels_per_window,
                             file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)

            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
Example #17
# Text processing file. Ingest and initial processing
# Project: Congressional Testimony Essay
# Ian P. Cook

import os, re, csv, string, operator
import nltk
from nltk.corpus import PlaintextCorpusReader
import time

start_time = time.time()
dir = '/Users/ian/Dropbox/Academia/Dissertation/testimony/hearings/112Congress/'

corpus_root = dir
hearings = PlaintextCorpusReader(corpus_root, '.*')

cfd = nltk.ConditionalFreqDist((target, fileid[-4:-9])
                               for fileid in hearings.fileids()
                               if os.path.getsize(fileid) > 0
                               for w in hearings.words(fileid)
                               for target in ['apologize', 'regret']
                               if w.lower().startswith(target))
print(time.time() - start_time)
cfd.plot()

# The above code works, with one exception: longer/multi-word
# "targets" cause an error: "local variable 'legend-loc' referenced before
# assignment". Not sure what that is about.

# Working on getting the filenames into a csv:
# pass the function a directory and it should traverse all the files
# and subfolders, read the filenames, and put them into the CSV's first
# column (a rough sketch of such a helper follows below).
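# Untested sketch of that helper, using the os and csv modules imported above;
# the function name `filenames_to_csv` and its arguments are made up for
# illustration only.
def filenames_to_csv(top_dir, out_csv):
    # Walk the directory tree and write one filename per row into the
    # first column of the CSV.
    with open(out_csv, 'w') as handle:
        writer = csv.writer(handle)
        for subdir, dirnames, filenames in os.walk(top_dir):
            for name in filenames:
                writer.writerow([os.path.join(subdir, name)])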
# CALCULATION OF TF VALUES

# TF = (number of times a term appears in a sentence) / (number of terms in that sentence)

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

corpus_root = r'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, 'resort.*\.txt')

print('\nFollowing file ids are there in this corpus: \n ')
print(wordlists.fileids())
print("\nNumber of sentences in the file are :")
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)
print('\n Sentences are : \n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)
sample = wordlists.raw("resort.txt")
s = sample.split('.')

#   NUMBER OF TIMES A TERM APPEAR IN EACH SENTENCE

#   NUMBER OF TERMS IN  EACH SENTENCE

wordfreq = []
term_freq = []
terms_count_doc = []
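# The snippet ends before the actual computation. One possible continuation,
# following the TF formula in the comment above (a sketch; only `s`, `wordfreq`,
# `term_freq` and `terms_count_doc` come from the code above, the rest is made up):
stop_words = set(stopwords.words('english'))
for sentence in s:
    terms = [w.lower() for w in word_tokenize(sentence)
             if w.isalpha() and w.lower() not in stop_words]
    terms_count_doc.append(len(terms))
    counts = {t: terms.count(t) for t in set(terms)}
    wordfreq.append(counts)
    # TF = (times the term appears in the sentence) / (terms in the sentence)
    term_freq.append({t: c / len(terms) for t, c in counts.items()}
                     if terms else {})

print(term_freq[:2])  # inspect the TF dictionaries of the first two sentences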
# Module 3: Corpus
# Own Corpus
# Author: Dr. Alfred

from nltk.corpus import PlaintextCorpusReader
corpus_root = 'corpus' 
my_corpus = PlaintextCorpusReader(corpus_root, '.*')

# print(my_corpus.fileids())

fileid = 'file1.txt'

text = my_corpus.raw(fileid)
print(text)

print(" Num of chars :",len(my_corpus.raw(fileid)))
print(" Num of words :",len(my_corpus.words(fileid)))
print(" Num of sentences :",len(my_corpus.sents(fileid)))



# let's make our program compatible with Python 3.0/1/2/3
from __future__ import division, print_function
from future_builtins import ascii, filter, hex, map, oct, zip

search_word = 'samsung'  # one-word string for this program

import os  # operating system commands
import re  # regular expressions
import nltk  # draw on the Python natural language toolkit
from nltk.corpus import PlaintextCorpusReader
from numpy import *  # for array calculations

# create lists of positive and negative words using Hu and Liu (2004) lists
my_directory = '/Users/ngaonkar/Desktop/Predict542/final_project'
positive_list = PlaintextCorpusReader(my_directory,
                                      'Hu_Liu_positive_word_list.txt',
                                      encoding='latin-1')
negative_list = PlaintextCorpusReader(my_directory,
                                      'Hu_Liu_negative_word_list.txt',
                                      encoding='latin-1')
positive_words = positive_list.words()
negative_words = negative_list.words()


# define bag-of-words dictionaries
def bag_of_words(words, value):
    return dict([(word, value) for word in words])


positive_scoring = bag_of_words(positive_words, 1)
negative_scoring = bag_of_words(negative_words, -1)
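# The example stops after building the scoring dictionaries. A sketch of how
# they might be applied (the helper name `score_text` and the sample word list
# are made up): merge the two bag-of-words dictionaries and sum the scores of
# the words found in a text.
scoring = dict(positive_scoring)
scoring.update(negative_scoring)


def score_text(words):
    # +1 for each positive word, -1 for each negative word, 0 otherwise
    return sum(scoring.get(w.lower(), 0) for w in words)


print(score_text(['the', 'screen', 'is', 'great', 'but',
                  'the', 'battery', 'is', 'awful']))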
        print(result)

        # 3b) If it is a NP, print out the text inside it
        print("Matching texts in NP chunks: ")
        for subtree in result.subtrees():
            if subtree.label() == "NP":
                subtree = list(map(lambda x: x[0], subtree.leaves()))
                subtree = " ".join(subtree)
                print(subtree)
        print()


if __name__ == '__main__':
    # Initialize Parser
    grammar = r"""
    NP: {<NNP>*}
        {<DT>?<JJ>?<NNS>}
        {<NN><NN>}
    """
    cp = nltk.RegexpParser(grammar)

    # Initalize corpus reader
    corpus_reader = PlaintextCorpusReader(root="./SpaceX", fileids=".*\.txt")

    # Tag all sentences
    sents = corpus_reader.sents("SpaceX.txt")
    tagged_sents = nltk.pos_tag_sents(sents)

    # Run parser on first five sentences
    run_parser(cp, tagged_sents[:5])
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    #reading the files
    corpus = PlaintextCorpusReader(in_dir, '.*')
    file_names_str = corpus.fileids()
    file_names = sorted(map(int, file_names_str))

    #Load corpus and generate the postings dictionary
    postings = defaultdict(dict)
    tokens = list()
    for docID in file_names:
        content = corpus.raw(str(docID))  # read file content
        content = preprocess(content)
        words = tokenize(content)  # tokenization: content -> words
        tokens = stemming(words)  # stemming

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)
        # count the number of times each token appears in the file
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len.keys():
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1
            term_pos += 1
        '''
        Generate weighted token frequency.

        Generate dictionary of key -> token, value -> a dict with k,v 
        as file_name, weighted_token_frequency
        '''
        if phrasal_query:

            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = PhrasalToken(freq[0], freq[1], w_tf)
        else:

            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = Token(w_tf)
    ''' 
    Output dictionary and postings files 

    - Dictionary file stores all the tokens, with their doc frequency, the offset 
    in the postings file.
    - Postings file stores the list of tuples -> (document ID, term freq).
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    #print(postings.items())
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            #print(value)
            '''
            len(value) := the document frequency of the token
                       := the number of documents in which the token appears
            offset := current writing position of the postings file
            '''
            offset = postings_file.tell()
            serialized = pickle.dumps(value)
            size = postings_file.write(serialized)
            dictionary[key] = Entry(len(value), offset, size)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(url_map, dictionary_file)
        pickle.dump(doc_id_map, dictionary_file)
        pickle.dump(pr_result, dictionary_file)
        pickle.dump(dictionary, dictionary_file)
        print("dictionary done")
'''
Statistics about a given corpus
'''
from math import sqrt
from nltk.corpus import PlaintextCorpusReader

#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/personae/data_50"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/fed_papers/F3"
corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/almedad/al3"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/ansar1/an9"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/blog_corpus/B2"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/test/test1_64"

corpus = PlaintextCorpusReader(corpus_root, '.*txt', encoding="UTF-8")
n_texts = len(corpus.fileids())

txt_lengths = []
text_classes = []

DISTINCT_AUTHORS = True
TEXT_STATS = True

for text in corpus.fileids():

    if DISTINCT_AUTHORS:
        found_category = text.partition(".")[0]
        text_classes.append(found_category)

    if TEXT_STATS:
        wrd_tokens = corpus.words(text)
Example #24
import os
import nltk

# Retrieve a file list#
files = os.listdir(".")
print "All the files in the directory"
#Starts a loop that prints the name of each file, #
#with the condition that it ends with ".txt"#
for file in files:
    if file.endswith(".txt"):
        print file

file_name = raw_input("Choose the file:")

print "The file that was chosen is {0}".format(file_name)

from nltk.corpus import PlaintextCorpusReader
corpus_root = "."  # "." means the current working directory
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())  #creates text object

keyword = raw_input("Specify word to search:")

search_text.concordance(keyword, 80, lines=30)

##NEW THING##
from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())

#
from nltk.corpus import stopwords
## path is andreaantenan/Desktop/cs195/nltk_data/corpora/stopwords/english.txt
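# The example ends right after importing the stop word list. A small sketch of
# how it might be applied to the Text object built above (Python 2 style to
# match the rest of the example; `content_words` is a made-up name):
english_stops = set(stopwords.words('english'))
content_words = [w.lower() for w in search_text.tokens
                 if w.isalpha() and w.lower() not in english_stops]
print "Kept {0} of {1} tokens after removing stopwords".format(
    len(content_words), len(search_text.tokens))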
Example #25
from sklearn.ensemble import RandomForestClassifier  # scikit-learn random forest classifier
from sklearn.neighbors import KNeighborsClassifier  # scikit-learn KNN classifier
from sklearn.svm import SVC  # scikit-learn support vector machine classifier
from sklearn import metrics  # evaluation metrics
from sklearn import cross_validation  # split into training and test sets
from nltk.corpus import PlaintextCorpusReader
from gensim.models import word2vec
import numpy as np
from numpy import *

# load our own corpus
print('Loading corpus...')
corpus_root_neg = r"E:\Strange\SRTP.11\NLP\data\ChnSentiCorp_htl_ba_6000\neg_pre6000"
corpus_root_pos = r"E:\Strange\SRTP.11\NLP\data\ChnSentiCorp_htl_ba_6000\pos_pre6000"

neg = PlaintextCorpusReader(corpus_root_neg, '.*')
pos = PlaintextCorpusReader(corpus_root_pos, '.*')

documents_neg = [(list(neg.words(fileid)), 0) for fileid in neg.fileids()]
documents_pos = [(list(pos.words(fileid)), 1) for fileid in pos.fileids()]
documents_neg.extend(documents_pos)
documents = documents_neg
random.shuffle(documents)  # shuffle the documents randomly

sentences = word2vec.Text8Corpus(
    r"E:\Strange\SRTP.11\NLP\data\ChnSentiCorp_htl_ba_6000\merge\1000.txt"
)  # load the corpus used to train the word vectors
model = word2vec.Word2Vec(sentences, size=150, min_count=1)  # train a word2vec model from scratch

# incremental training
#print('Loading model...')
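# The incremental-training branch above is commented out. A hedged sketch of what
# it could look like with gensim (the model file name and `more_sentences` are
# made up; left commented out like the original):
#model = word2vec.Word2Vec.load('w2v_hotel.model')
#more_sentences = word2vec.Text8Corpus(r"E:\path\to\more_reviews.txt")
#model.build_vocab(more_sentences, update=True)
#model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)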
Example #26
import nltk
from nltk.corpus import PlaintextCorpusReader
mycorpus = PlaintextCorpusReader('.', '.*.txt')
mycorpus.fileids()
part2 = mycorpus.fileids()[1]
part2
part2string = mycorpus.raw('state_union_part2.txt')
part2tokens = nltk.word_tokenize(part2string)
part2tokens[:100]

len(part2string)
len(part2tokens)
alphapart2 = [w for w in part2tokens if w.isalpha()]
alphapart2[:100]
alphalowerpart2 = [w.lower() for w in alphapart2]
alphalowerpart2[:50]
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)
stopwords
stoppedalphalowerpart2 = [w for w in alphalowerpart2 if w not in stopwords]
from nltk import FreqDist
fdist = FreqDist(stoppedalphalowerpart2)
fdistkeys = list(fdist.keys())
fdistkeys[:50]
print('Printing top 50 words by frequency: ')
topkeys = fdist.most_common(50)
for pair in topkeys:
    print(pair)
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(alphalowerpart2)
Example #27
if op == '': op = '1'

dataset = int(op)

if dataset == 2:
    # scientific articles
    corpus_root = '/home/mguevara/Dropbox/DOCTORADO/TECNOLOGIA BUSQUEDA/Tarea1/SolucionconNLTK/articulos2'
    export_indices = '/home/mguevara/datasets/info/indices/articulos/'
    export_matrices = '/home/mguevara/datasets/info/matrices/articulos/'
    export_vocabularios = '/home/mguevara/datasets/info/vocabularios/articulos/'
    exp_archivos = '.*'
    termino_ejemplo = 'actor'  #'articulo857.txt'
    documento_ejemplo = '857'
    print_titulo("CREATE CORPUS")
    from nltk.corpus import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_root, exp_archivos)

if dataset == 3:
    export_indices = '/home/mguevara/datasets/info/indices/reuters/'
    export_matrices = '/home/mguevara/datasets/info/matrices/reuters/'
    export_vocabularios = '/home/mguevara/datasets/info/vocabularios/reuters/'
    #exp_archivos='.*'
    termino_ejemplo = 'aguila'  #'articulo857.txt'
    documento_ejemplo = 'training/844'
    print_titulo("CREATE CORPUS")
    from nltk.corpus import reuters
    corpus = reuters

#reuters27000
#corpus_root = '/home/mguevara/datasets'
#exp_archivos='reuters/.*'
Example #28
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Users/sydshir/Desktop/Code'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
pillar = nltk.Text(wordlists.words('cityandthepillar.txt'))
pillar[:10]
pillar.concordance('gay')
list(nltk.bigrams(pillar))

#Frequency Distribution Calculator
fdist1 = nltk.FreqDist(pillar)
print(fdist1)
fdist1.most_common(50)
fdist1['gay']


#Bigrams Generator
def generate_model(cfdist, word, num=1):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()


text = nltk.Text(wordlists.words('cityandthepillar.txt'))
bigrams = nltk.bigrams(pillar)
cfd = nltk.ConditionalFreqDist(bigrams)
cfd['gay']
generate_model(cfd, 'gay')
Example #29
plt.title('Number of Unique Words', fontsize=20)

plt.subplot(1, 3, 3)
plt.barh(y_pos, data_wpm_sort.words_per_posts, align='center')
plt.yticks(y_pos, data_wpm_sort.Anio)
plt.title('Number of Words Per Posts', fontsize=20)

plt.tight_layout()
plt.show()

# Word frequency
import nltk
from nltk.corpus import PlaintextCorpusReader
#corpus_root = './python_projects/blog' 
corpus_root=path
wordlists = PlaintextCorpusReader(corpus_root, '.*', encoding='latin-1')
#wordlists.fileids()  # this lists the files in the directory

cfd = nltk.ConditionalFreqDist(
        (word,genre)
        for genre in anios
        for w in wordlists.words(genre + '.txt')
        for word in ['casa','mundo','tiempo','vida']
        if w.lower().startswith(word) )
cfd.plot()


# (not working)
# Sentiment analysis
'''
from classifier import SentimentClassifier
Example #30
import sys
import os.path
import math
import nltk
from nltk.corpus import PlaintextCorpusReader

# sys.argv.append('./gold/pku_training_words.utf8')
# sys.argv.append('./training/pku_training.utf8')
# sys.argv.append('./testing/pku_test.utf8')

assert len(sys.argv) == 4

with open(sys.argv[1], 'rt', encoding='utf8') as f:
    training_words = [w.strip() for w in f.readlines()]

training = PlaintextCorpusReader(*os.path.split(sys.argv[2]))
training_words += list(training.words())
#training_words = list(training.words())
N = len(training_words)
V = len(set(training_words))
fdist = nltk.FreqDist(training_words)
fdist = dict([(w, math.log((c + 1.0) / (N + V))) for w, c in fdist.items()])
defprob = math.log(1.0 / (N + V))

with open(sys.argv[3], 'rt', encoding='utf8') as f:
    test = f.readlines()


def get_DAG(sentence):
    DAG = {}
    T = len(sentence)