Example #1
def compare(request):
    errors = []
    stats = []
    for x in range(1, 3):
        statistics = []
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)

    return render_to_response('compare.html', {'stats': stats})
Example #2
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
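Examples #1 and #2 follow the same pattern: point a PlaintextCorpusReader at a single canto file, then derive counts from words(), sents(), paras() and line_tokenize(raw()). A minimal standalone sketch of that pattern (Python 3, with true division so the ratios are not truncated; the root directory and file name are assumptions):

from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import line_tokenize

def canto_statistics(root=".", filename="canto1.txt"):  # hypothetical location
    w = PlaintextCorpusReader(root, filename)
    words, sents, paras = w.words(), w.sents(), w.paras()
    lines = line_tokenize(w.raw())
    unique = set(words)
    return {
        "words": len(words),
        "unique words": len(unique),
        "sentences": len(sents),
        "lines": len(lines),
        "paragraphs": len(paras),
        "lines per paragraph": len(lines) / len(paras),
        "lexical density (total/unique)": len(words) / len(unique),
        "words per sentence": len(words) / len(sents),
    }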
Example #3
def extractParasInList(name):
    corpuslocation = '/Users/anis/seniorProject/aligned Paragraphs/algebra'
    reader = PlaintextCorpusReader(corpuslocation, r'.*\.txt')
    # reader.paras() gives the list of paragraphs; every paragraph is a list of
    # sentences, and every sentence is a list of words, so it is a list of lists.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    numpara = len(paragraphlist)
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        # join all the sentence lists of this paragraph into one flat word list
        paraAsAList = []
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        # the whole paragraph as one string
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + " "
        pList.append(paraAsAString)
    return pList
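paras() returns a list of paragraphs, each paragraph a list of sentences, and each sentence a list of word tokens. The flattening done by the loops above can be written more compactly; a sketch under the same assumptions (the corpus directory and file name are placeholders taken from the example):

from itertools import chain
from nltk.corpus import PlaintextCorpusReader

def paras_as_strings(corpus_root, fileid):
    # e.g. paras_as_strings('/Users/anis/seniorProject/aligned Paragraphs/algebra', 'simpleTuring.txt')
    reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    # join every sentence of every paragraph into one string per paragraph
    return [" ".join(chain.from_iterable(para)) for para in reader.paras(fileid)]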
Example #4
def train():

   wordlists = PlaintextCorpusReader('', file_path)

   st = stemmer()
   
   # Get blocks of text using NLTK
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # LOGIC
   #       If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as 
   #       [posi/nega]tive

   # Count words
   word_features = []

   # Go through paragraphs
   for p in paras:

      # Classify S
      score_positive_negative = 0
      for s in p:
         for word in s:

            word = st.stem(word)

            if word in words_positive:
               score_positive_negative += 1
            elif word in words_negative:
               score_positive_negative -= 1
   
      # Record class of paragraph for any words present
      for s in p:
         for word in s:

            word = st.stem(word)

            if score_positive_negative > 0:
               word_features.append( ({"word": word}, "+") )
            elif score_positive_negative < 0:
               word_features.append( ({"word": word}, "-") )
            else:
               word_features.append( ({"word": word}, " ") )

   # Create and return classifier
   classifier = nltk.NaiveBayesClassifier.train(word_features)
   return classifier
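train() depends on names defined elsewhere in the original script: file_path, stemmer, words_positive and words_negative. A hedged sketch of plausible definitions (all values below are assumptions, not the original author's; the stemmer choice follows Example #8, which uses LancasterStemmer):

from nltk.stem import LancasterStemmer

# Assumed module-level setup; the original file defines these elsewhere.
file_path = "reviews.txt"      # hypothetical corpus file
stemmer = LancasterStemmer     # train() calls stemmer(), so a class/factory fits

# Small illustrative seed lexicons; the real word lists are not shown.
_st = LancasterStemmer()
words_positive = {_st.stem(w) for w in ["good", "great", "happy", "love"]}
words_negative = {_st.stem(w) for w in ["bad", "poor", "sad", "hate"]}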
Example #5
def main():

   st = stemmer()

   # Get data
   wordlists = PlaintextCorpusReader('', file_path)
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # Train
   classifier = train()

   # Get class probabilities (for MAP estimation)
   counts = {"P":0, "-":0, "N":0}
   for i in range(0,len(paras)):
      for s in paras[i]:

         score_pos = 0
         score_neg = 0

         # Classify paragraph
         for word in s:

            word = st.stem(word)

            feature = {"word":word}
            classified = classifier.classify(feature)

            if classified == "+":
               score_pos += 1
            elif classified == "-":
               score_neg += 1

         # Record result
         if score_pos > score_neg:
            counts["P"] += 1
         elif score_pos < score_neg:
            counts["N"] += 1
         else:
            counts["-"] += 1

   # Done!
   print counts
Example #6
def extractParasInList(name):
    corpuslocation = '/home/aniszaman/seniorProject/combined/carnivore'
    reader = PlaintextCorpusReader(corpuslocation, r'.*\.txt')
    # reader.paras() gives the list of paragraphs; every paragraph is a list of
    # sentences, and every sentence is a list of words, so it is a list of lists.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    numpara = len(paragraphlist)
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        # join all the sentence lists of this paragraph into one flat word list
        paraAsAList = []
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        # the whole paragraph as one string
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + " "
        pList.append(paraAsAString)
    return pList
Example #7
    def extract_data(self,
                     filepath,
                     ind_features=_PARAIND_FEAT,
                     dep_features=_PARADEP_FEAT,
                     labels_per_sent=None,
                     labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" +
                                           string.punctuation + "]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"

        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([
                numpy_filepath_pca,
        ]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                     len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                          ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(
            self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)

        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn,
                         self.dictVectorizer.get_feature_names(),
                         arff_filepath,
                         filename + "_RAW",
                         labels_per_window,
                         file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)

        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(
                matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca,
                             feature_names,
                             arff_filepath_pca,
                             filename + "_PCA95",
                             labels_per_window,
                             file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)

            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
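util.pca here is project-specific; the docstring and the _pca95 file names suggest it keeps enough principal components to explain 95% of the variance. A rough scikit-learn sketch of that step (an assumption about what the helper does, not its actual implementation):

from sklearn.decomposition import PCA

def pca_95(matrix):
    """Reduce the feature matrix to the components explaining 95% of the variance."""
    pca = PCA(n_components=0.95)  # a fraction in (0, 1) selects components by explained variance
    reduced = pca.fit_transform(matrix)
    # The original feature names no longer map 1:1 onto components.
    component_names = ["pc{}".format(i) for i in range(reduced.shape[1])]
    return reduced, component_names

# matrix_sklearn_pca, feature_names = pca_95(matrix_sklearn)
# numpy.save(numpy_filepath_pca, matrix_sklearn_pca)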
Example #8
def main(print_out, motifs, chapter):
   wordlists = PlaintextCorpusReader('', 'Punctuated/pot_ch[12345]\.txt')

   #rep_words = nltk.FreqDist(brown.words()) # Get representative word counts

   st = LancasterStemmer()
   #st = RegexpStemmer('ing$|s$|e$', min=4)

   for i in range(1,6): 
   
      if i != chapter:
        continue   
   
      g = nx.Graph()

      words = wordlists.words('Punctuated/pot_ch{!s}.txt'.format(str(i)))
      paras = wordlists.paras('Punctuated/pot_ch{!s}.txt'.format(str(i)))

      # Generate HTML
      #with open("test" + str(i) + ".txt", "w+") as fi:
      #   output = generate_html_level2(wordlists, st, words, paras, i)
      #   fi.write(output)
      
      json_dict = {}
      json_dict["nodes"] = []
      json_dict["edges"] = []

      # Get correlation coefficients
      corr_data = get_corr_coefs(wordlists, st, words, paras, print_out, motifs)
      corr_coefs = corr_data[0]
      corr_freqs = corr_data[1]

      # ---------------------------------- NetworkX ----------------------------------
      # Get NetworkX nodes
      nx_added_nodes = []
      for m1 in corr_coefs:
         g.add_node(m1)

      # Get NetworkX edges
      for m1 in corr_coefs:
         for m2 in corr_coefs[m1]:

             # Avoid repeats
             if m1 <= m2:
                 continue

             g.add_edge(m1, m2)

      # -------------------------------- End NetworkX --------------------------------

      # -------------------------------------- d3.js --------------------------------------
      # Get d3-js nodes
      json_node_numbers = dict()
      square_size = 0
      for m1 in corr_coefs:

         sz = int(min(corr_freqs[m1]/3.0,50))*3
         #print sz

         json_node = {
             "name": m1,
             "size": str(sz),
             "color": "#aaaaaa"
         }
         json_dict["nodes"].append(json_node)
         json_node_numbers[m1] = len(json_node_numbers)

      # Get d3-js edges
      m1m2 = 0
      for m1 in corr_coefs:
         for m2 in corr_coefs[m1]:

             # Avoid repeats
             if m1 <= m2:
                 continue

             # No need to worry about repeats, since corr_coefs won't contain them
             edge_size = corr_coefs[m1][m2]
             #print "ES " + m1 + "/" + m2 + ": " + str(edge_size)
             json_edge = {
                 "name": m1 + "-" + m2,
                 "source": json_node_numbers[m1],
                 "target": json_node_numbers[m2],
                 "size": str(edge_size)
             }
             json_dict["edges"].append(json_edge)

      # Add boundary d3-js node
      json_dict["nodes"].append({
          "id": "the-end",
          "x": square_size,
          "y": square_size,
          "size": "1",
          "color": "#000000"
      })

      # Write JSON to file
      if not print_out:
          with open("OFFICIAL/data" + str(i) + ".json", "w+") as fi:
            fi.write("var json_str_" + str(i) + "=" + json.dumps(json_dict, fi, indent = 2))
      else:
          mn = smin(motifs[0],motifs[1])
          mx = smax(motifs[0],motifs[1])

          path = "OFFICIAL/inserts/" + mn + ("-" if len(mn) != 0 else "") + mx + "-" + str(chapter) + ".html"
          print path
          with open(path, "w+") as fi:
             fi.write(corr_data[2])
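For reference, the structure that json.dumps(json_dict) writes for one chapter looks roughly like this (motif names and numbers are placeholders, not values from the text):

# Illustrative shape of the exported data, not real output.
json_dict_example = {
    "nodes": [
        {"name": "motif_a", "size": "9", "color": "#aaaaaa"},
        {"name": "motif_b", "size": "6", "color": "#aaaaaa"},
        {"id": "the-end", "x": 0, "y": 0, "size": "1", "color": "#000000"},
    ],
    "edges": [
        {"name": "motif_b-motif_a", "source": 1, "target": 0, "size": "0.5"},
    ],
}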
Example #9
    def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT, labels_per_sent=None, labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, 
            [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|["+string.punctuation+"]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])            
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"
        
        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([numpy_filepath_pca,]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup, len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE, ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)
        
        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(), arff_filepath, filename+"_RAW", labels_per_window, file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)
        
        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca, filename+"_PCA95", labels_per_window, file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
            
            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
Example #10
NUM_WORDS = 20
wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')

def clean_words(words):
    #convert everything to lower case
    words = [w.lower() for w in words]
    #remove period from end of sentences
    words =  [re.sub('\.','',w) for w in words]
    #only keep alphabetic strings
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if not w in stopwords.words('english')]
    #do stemming "goes" => "go"
    words = [nltk.PorterStemmer().stem(w) for w in words]
    return words

paras = [sum(para, []) for para in wordlist.paras('ofk.txt')]
words = clean_words(wordlist.words('ofk.txt'))
groups = []
for i in range(0, len(paras), GROUP_LENGTH):
    group = sum(paras[i : min(i + GROUP_LENGTH, len(paras))], [])
    groups.append(group)

freqs = []
for group in groups:
    freq = FreqDist(clean_words(group))
    table = {w:freq[w] for w in freq}
    freqs.append(table)

top_words = [w for w in FreqDist(words)][:NUM_WORDS]
def get_word_freqs(word):
    return {'word':word, 'values':[{'x':i, 'y':freqs[i].get(word, 0)} for i in range(len(freqs))]}
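GROUP_LENGTH is needed by the grouping loop above but is not shown in the excerpt. A hedged usage sketch of get_word_freqs, assuming the code above has run (the group length is only a guess and would normally be defined near NUM_WORDS):

GROUP_LENGTH = 10  # assumed value; define this before the grouping loop

# One frequency series per top word, ready to plot group by group:
series = [get_word_freqs(w) for w in top_words]
# series[0] == {'word': some_stem, 'values': [{'x': 0, 'y': count_in_group_0}, ...]}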
Example #11
class Corpus:
    """
    The Corpus class creates a corpus, that is, a set of speeches to be analyzed.
    The constructor takes a list of files from the speeches folder as a parameter,
    e.g. ["1977.txt", "1980.txt"].
    If the list is empty, the corpus is created from the complete collection of
    speeches (1975 to 2017).
    """
    def __init__(self, files):
        if not files:
            self.corpus = PlaintextCorpusReader('./data/speeches', '.*')
        else:
            self.corpus = PlaintextCorpusReader('./data/speeches', files)
        self.speech = Speech(self.corpus.raw(), self.corpus.words(),
                             self.corpus.sents(), self.corpus.paras(), None,
                             None, None, None)
        self.speeches = build_speeches_dict(self.corpus)
        self.years = [
            int(year.split('.')[0]) for year in self.corpus.fileids()
        ]
        complementary_years = list(
            set(os.listdir("./data/speeches")) -
            set([str(years) + '.txt' for years in self.years]))
        if not files:
            self.complementary = None
            self.unique_words = None
        else:
            self.complementary = ComplementaryCorpus(complementary_years)
            self.unique_words = [
                word for word in self.speech.tokens
                if word not in self.complementary.speech.tokens
            ]

    def to_speeches_list(self):
        speeches_list = []
        for key, speech in self.speeches.items():
            speeches_list.append(speech.speech_to_dict())
        return speeches_list

    def print_graph(self, my_words):
        """
        :param my_words: list of words whose frequency is to be plotted
        :return: a frequency plot
        """
        cfd = nltk.ConditionalFreqDist((target, fileid)
                                       for fileid in self.speeches.keys()
                                       for w in self.speeches[fileid].tokens
                                       for target in my_words
                                       if w.lower() == target)
        cfd.plot()

    def get_files(self):
        """
        :return: list of files in the Corpus object
        """
        return self.corpus.fileids()

    def unique_words_freq(self):
        """
        :return: the words in the corpus object that are unique to that corpus
        (i.e., these words don't appear in the rest of the speeches)
        """
        if self.unique_words is None:
            return "The corpus contains all speeches, so no comparison can be made"
        else:
            return nltk.FreqDist(self.unique_words).most_common(50)

    def radiography(self):
        """
        The method that returns the lexical radiography of the corpus
        :return: prints lexical analysis from the corpus
        """
        print("Lexical data for period from " + str(self.years[0]) + " to " +
              str(self.years[-1]))
        print(str(len(self.years)) + " total speeches")
        print(str(len(self.corpus.words())) + " total words")
        print(
            str(len(self.corpus.words()) / len(self.get_files())) +
            " words per speech")
        print("Frequency distribution:")
        print(self.speech.frequencies())
        print("Content words frequency distribution:")
        print(self.speech.most_frequent_content_words())
        print("Unique words frequency distribution:")
        print(self.unique_words_freq())
        print("Most frequent content bigrams:")
        print(self.speech.most_frequent_bigrams())
        print("Most frequent content trigrams:")
        print(self.speech.most_frequent_trigrams())
        print("#######################################")
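A brief usage sketch of the Corpus class above (the speech files follow the docstring's example; Speech and ComplementaryCorpus come from the same project; the target words passed to print_graph are hypothetical):

corpus = Corpus(["1977.txt", "1980.txt"])
print(corpus.get_files())          # ['1977.txt', '1980.txt']
print(corpus.unique_words_freq())  # 50 most common words not used in the other speeches
corpus.radiography()               # full lexical summary of the two speeches
corpus.print_graph(["libertad", "democracia"])  # hypothetical target words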
Example #12
def text_to_paras(name, book_dir):
    book_id = get_book_id(name)
    book_path = get_path(book_id, book_dir)
    corpus = PlaintextCorpusReader(book_path, '.*')
    paragraphs = corpus.paras('book%s.txt' % book_id)
    return paragraphs
Example #13
def text_to_paras(book_id):
    # book_dir: the directory containing the .txt files
    book_path = get_path(book_id, book_dir)
    corpus = PlaintextCorpusReader(book_path, '.*')
    paragraphs = corpus.paras('book%s.txt' % book_id)
    return paragraphs
Example #14
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import wordnet as wn

# Grab stopwords.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords) #127

# Read the plain text.
corpus_root = 'corpora'
aow = PlaintextCorpusReader(corpus_root, 'Art-of-War.txt')

aow.fileids() #['Art-of-War.txt']
aow.words() #['THE', 'ART', 'OF', 'WAR', 'BY', 'SUN', 'TZU', ...]
len(aow.words()) #13038
len(aow.sents()) #943
len(aow.paras()) #399
len([s for s in aow.sents() if 'enemy' in s]) #111
len([s for s in aow.sents() if 'enemies' in s]) #1
len([s for s in aow.sents() if 'ally' in s]) #2
len([s for s in aow.sents() if 'allies' in s]) #3
len([s for s in aow.sents() if 'spy' in s]) #8
len([s for s in aow.sents() if 'spies' in s]) #11

# Extract the sentences containing words that start with 'enem' (enemy, enemies, ...).
enemy_sents = []
for s in aow.sents():
    for w in s:
        if w.lower().startswith('enem'):
            enemy_sents.append(s) # TODO Skip if seen.

len(enemy_sents) #126 XXX Can contain duplicates. Fix TODO above.
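The TODO above can be addressed by collecting each sentence at most once; a small sketch using the same aow reader:

# Deduplicate: a sentence is added once even if it contains several 'enem...' tokens,
# and identical sentences are not added twice.
enemy_sents = []
seen = set()
for s in aow.sents():
    if any(w.lower().startswith('enem') for w in s):
        key = tuple(s)
        if key not in seen:
            seen.add(key)
            enemy_sents.append(s)

len(enemy_sents)  # at most the 126 counted above, with duplicates removed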
Example #15
    else:
        return ''


# Reading corpus
corpus_root = '/Users/elgayaro/nltk_data/corpora/tudiab'
ww = PlaintextCorpusReader(corpus_root,
                           r'(?!README|\.).*\.txt',
                           para_block_reader=read_line_block)

# Displaying sample corpus
print('The files in this corpus are: ', ww.fileids())
print('Number of unique words (before pre-processing) in the corpus = ',
      len(set(ww.words())))
print('Number of documents/paragraphs/threads in the corpus = ',
      len(ww.paras()))

print("+++++++++ Sample 1000 un-filtered words +++++++++")
words = set(ww.words())
print(random.sample(words, 1000))

# print(ww.words())
# print(ww.sents()[0])
# print(ww.paras()[10])
# print(ww.raw()[:10])

# Defining stop words
stopwords = stopwords.words('english')
add_stopwords = [
    'http', 'https', '://', 'www', 'com', '8800', '...', '....', 'yep', '.).',
    '](#', '.:).', '++..', 'github', 'etc', 'also', 'org', 'gee', 'let',
Example #16
'''def beatles_corpus.sents(self, fileids=None):
    """
    :return: the given file(s) as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    :rtype: list(list(str))
    """
    if self._sent_tokenizer is None:
        raise ValueError('No sentence tokenizer for this corpus')

    return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
                 for (path, enc, fileid)
                 in self.abspaths(fileids, True, True)])
'''
k = 0
custom_corpus = nltk.Text(beatles_corpus.words())
for i in beatles_corpus.paras():
    # Iterate through songs, printing out contents
    # print "Song # " + str( k )
    k += 1
    l = 0
    while l < len(i):
        # print "Line " + str(l)
        # print i[l]
        l += 1
    # for j in i:
    # print j[0]

# new_song = custom_corpus.generate(100)
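beatles_corpus is not defined in the excerpt. Given that the loop treats each paragraph as a song and each inner element as a line, a hedged sketch of how such a reader could be configured (the directory and file pattern are assumptions):

from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer

# Hypothetical layout: plain-text lyrics with songs separated by blank lines.
# A line-based sentence tokenizer makes paras() yield songs, each song being a
# list of lyric lines and each line a list of word tokens.
beatles_corpus = PlaintextCorpusReader(
    'lyrics', r'.*\.txt',
    sent_tokenizer=LineTokenizer(blanklines='discard'))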