Example #1
def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Preprocess hotel reviews and classify the truthfulness of each review.
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
            continue

        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])

    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))

    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        #truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1

    import csv

    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
Example #2
def compare(request):
    errors = []
    stats = []
    for x in range(1, 3):
        statistics = []
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)

    return render_to_response('compare.html', {'stats': stats})
Example #3
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
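Example #4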
 def parse_corpus(self, corpus_type):
     self.corpus_type = '%s.txt' %corpus_type
     self.corpus = line_tokenize(PlaintextCorpusReader(CORPUS_PATH, self.corpus_type).raw().lower())
     if corpus_type == 'institution':
         for line in range(len(self.corpus)):
             self.corpus[line] = self.corpus[line].split(',')
     return self.corpus
Example #5
 def entries(self, fileids=mwa_ppdb_xxxl_file):
     """
     :return: a list of tuples of synonym word pairs.
     """
     return [
         tuple(line.split('\t'))
         for line in line_tokenize(self.raw(fileids))
     ]
Example #6
 def __init__(self, doc_dir):
     convertion_style = ""
     parse = Parser(join(ROOT, 'templates', 'event.xml'))
     self._template_metadata = parse.xml_template_metadata()
     page = self._template_metadata['page']
     self._preparator = Preparator(doc_dir)
     self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
     self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
     self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
     self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')
Example #7
 def __init__(self, doc_dir):
     convertion_style = "-raw"
     self._eventextractor = EventExtractor(doc_dir)
     parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
     self._template_metadata = parse.xml_template_metadata()
     page = self._template_metadata['page']
     self._preparator = Preparator(doc_dir)
     self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
     self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
     self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
Example #8
def load_topic_words(topic_file):
    """ given a path to a .ts file returns a dictionary of type { string : float }
    mapping topic words to their chi square scores """
    topic_words_dict = dict()
    raw = open(topic_file).read()
    lines = line_tokenize(raw)
    for line in lines:  # no cutoff outside of TopicS 0.1
        pair = line.split(" ")
        topic_words_dict[pair[0]] = float(pair[1])
    return topic_words_dict
Example #9
def load_topic_words(topic_file):
    """ given a path to a .ts file returns a dictionary of type { string : float }
    mapping topic words to their chi square scores """
    topic_words_dict = dict()
    raw = open(topic_file).read()
    lines = line_tokenize(raw)
    for line in lines: # no cutoff outside of TopicS 0.1
        pair = line.split(" ")
        topic_words_dict[pair[0]] = float(pair[1])
    return topic_words_dict
Example #10
 def __init__(self, doc_dir):
     convertion_style = ""
     parse = Parser(join(ROOT, 'templates', 'tcc.xml'))
     self._template_metadata = parse.xml_template_metadata()
     page = self._template_metadata['page']
     pages = self._template_metadata['pages']
     self._preparator = Preparator(doc_dir)
     self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
     self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style)
     self._linetokenized_onepage_raw_doc = open('%s.txt' %self._preparator.doc_dir).readlines()
     self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ')
     self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
     self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc)
     self.linebreak = "\n"
Example #11
 def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
     """
     This module returns a list of nonbreaking prefixes for the specified
     language(s).
     
     >>> from nltk.corpus import nonbreaking_prefixes as nbp
     >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
     True
     >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
     True
     
      :return: a list of words for the specified language(s).
     """
     # If *lang* in list of languages available, allocate apt fileid.
     # Otherwise, the function returns non-breaking prefixes for 
     # all languages when fileids==None.
     if lang in self.available_langs:
         lang = self.available_langs[lang]
         fileids = ['nonbreaking_prefix.'+lang]
     return [line for line in line_tokenize(self.raw(fileids))
             if not line.startswith(ignore_lines_startswith)]
Example #12
def main():
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    thispage = pages[4]
    # The easiest way to deal with strings in Python that contain escape
    # characters and quotes is to triple double-quote the string (""") and
    # prefix it with r. For example:
    #
    #     my_str = r"""This string would "really "suck"" to write if I didn't
    #     know how to tell Python to parse it as "raw" text with the 'r' character and
    #     triple " quotes. Especially since I want \n to show up as a backslash followed
    #     by n. I don't want \0 to be the null byte either!"""
    #
    # The r means "take escape characters as literal". The triple double-quotes (""")
    # prevent single-quotes, double-quotes, and double double-quotes from prematurely
    # ending the string.

    m = re.search("(\d)", thispage)
    thisitem = m.group(0)
    m = re.search("(\d\d\D\d\d)", thispage)
    thisdate = m.group(0)
    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl)-2
    thisurl = thispage[starturl:endurl] 
    soup = BeautifulSoup(thispage)
    newpage = soup.findAll(text=True)
    html = replace_all(newpage, reps)
    html = html[11:len(html)]
    postdate = html[0:5]
    posttext = html[5:len(html)]
    print "post date = " + postdate
    print "post text = " + posttext

def replace_all(txt, reps):
    for i, j in reps.iteritems():
        txt = txt.replace(i, j)
    return txt

if __name__ == "__main__":
    main()
Example #13
 def words(self, fileids=None, ignore_lines_startswith='\n'):
     return [line for line in line_tokenize(self.raw(fileids))
             if not line.startswith(ignore_lines_startswith)]
Example #14
 # print(file_path)
 # print(content)
 length = len(content) if content is not None else content
 fbar.set_description(f'{file_path}: {length}')
 # TYPE
 if content is None:
     texts = ['']
 elif text_type == 'full':
     texts = [content]
 elif text_type == 'parablank':
     texts = []
     for p in blankline_tokenize(content):
         texts.append(p)
 elif text_type == 'paraline':
     texts = []
     for p in line_tokenize(content):
         texts.append(p)
 else:
     raise NotImplementedError(text_type)
 # NORM
 if norm_type == 'stem':
     texts = [
         ' '.join(
             snow.stem(x) for x in word_tokenize(y)
             if x.isalnum() and x.lower() not in stop)
         for y in texts
     ]
 elif norm_type == 'lem':
     texts = [
         ' '.join(
             morph.parse(x)[0].normal_form
Example #15
# Load Libraries
from nltk.tokenize import line_tokenize


sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"


sent_list = line_tokenize(sentence)
print "No sentences = %d"%(len(sent_list))
print "Sentences"
for sent in sent_list: print sent

# Include new line characters
sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled\n \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled\n \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"

sent_list = line_tokenize(sentence)
print "No sentences = %d"%(len(sent_list))
print "Sentences"
for sent in sent_list: print sent
Example #16
 def entries(self, fileids=mwa_ppdb_xxxl_file):
     """
     :return: a list of tuples of synonym word pairs.
     """
     return [tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))]
Example #17
# Load Libraries
from nltk.tokenize import line_tokenize

sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"

sent_list = line_tokenize(sentence)
print "No sentences = %d" % (len(sent_list))
print "Sentences"
for sent in sent_list:
    print sent

# Include new line characters
sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled\n \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled\n \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"

sent_list = line_tokenize(sentence)
print "No sentences = %d" % (len(sent_list))
print "Sentences"
for sent in sent_list:
    print sent
Example #18
      print('{0}: {1}, '.format(k, ss[k]), end='')
    print()
"""

from nltk.tokenize import sent_tokenize, word_tokenize,line_tokenize

f = open("2012LTinsultsLKML.tsv.txt", "r")
all_text = f.read()
print(all_text)

#for i in word_tokenize(all_text):
 # print(i)
print(word_tokenize(all_text))

t_hash = "#"
for i in line_tokenize(all_text):
  if t_hash not in i:
    print(i)

# preprocess the text to remove the unwanted parts and make the whole text lowercase
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("<!--?.*?-->","",text)
    """
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
Example #19
import nltk
from nltk import word_tokenize
from nltk.tokenize import line_tokenize
from nltk.corpus import gutenberg
from nltk.model import build_vocabulary
from nltk.model import count_ngrams
from nltk.model import MLENgramModel
from nltk.model import LidstoneNgramModel
# load doc into memory

raw = open('datasets/WW_Dataset.txt', 'r').read()
print(raw[:75])

tokens = word_tokenize(raw)
print(len(tokens))
lines = line_tokenize(raw)
test_lines = lines[3:5]
test_words = [w for s in test_lines for w in word_tokenize(s)]

print(test_words[:5])
corpus = [w.lower() for w in tokens]
text = nltk.Text(tokens)
words = [w.lower() for w in tokens]
print(words[:10])
vocab = sorted(set(words))
print(len(vocab))
spl = int(95*len(corpus)/100)
train = text[:spl]
test = text[spl:]
vocab = build_vocabulary(2, words)
bigram_counts = count_ngrams(2, vocab, text)
Example #20
 def words(self, fileids=None):
     return line_tokenize(self.raw(fileids))
Example #21
    perplexity(file_contents_test, unigrams_probability_dict, bigrams_probability_dict)


def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Preprocess hotel reviews and classify the truthfulness of each review.
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    #print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
            continue

        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])