Example #1
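# Extracts keyword candidates from a PDF parsed through an Apache Tika server.
# Assumes the enclosing module imports `obo`, `tika_obo`, and `from tika import parser`.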
def getKeywords(pdfFile, Occur):

    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)

    metadata = parsed["metadata"]
    doccontent = parsed["content"]

    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    count = 0
    keywords = []
    shortkey = []
    maxoccur = Occur
    for s in sorteddict:
        numocc = int(s[0])
        word = s[1].encode('utf-8')
        if numocc > maxoccur:
            keyword = {word: str(numocc)}
            keywords.append(keyword)
            if len(word) > 6:
                shortkey.append(word.lower())
        count = count + 1
    if Occur > 0:
        return shortkey
    return keywords
Example #2
  def __iter__(self):
     for year in years:
         fname = 'arXiv-meta-{}.xml'.format(year)
         #path=os.path.dirname(os.getcwd())+'\\DATA\\'+fname
         path = os.path.dirname(os.getcwd()) + '/DATA/SORTED/' + fname
         for event, elem in ET.iterparse(path):
             if elem.tag == 'description':
                 desc = obo.removeStopwords(obo.stripNonAlphaNum(elem.text),
                                            obo.stopwords)
                 yield dictionary.doc2bow(desc)
Example #3
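# Counts how many words from a Latin-1 text file match a positive-word list via Rabin-Karp search.
# `posiwordlist` is assumed to be defined elsewhere in the source module.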
def getPositive(tf):
    file = open(tf, encoding="Latin-1")
    text = file.read()
    file.close()

    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.rabinKarp1(fullwordlist, obo.stopwords, 101)
    wordlist1 = obo.rabinKarp2(wordlist, posiwordlist, 101)

    return len(wordlist1)
Example #4
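# Classifies a text file as Positive or Negative by comparing positive and negative word hits.
# Note: both branches return the positive-word share of the combined total.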
def PostiveOrNegative(tf=''):
    file = open(tf, encoding="Latin-1")
    text = file.read()
    file.close()

    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.rabinKarp1(fullwordlist, obo.stopwords, 101)
    wordlist1 = obo.rabinKarp2(wordlist, posiwordlist, 101)
    wordlist2 = obo.rabinKarp2(wordlist, negawordlist, 101)

    if len(wordlist1) > len(wordlist2):
        return ['Positive', len(wordlist1) / (len(wordlist1) + len(wordlist2))]
    else:
        return ['Negative', len(wordlist1) / (len(wordlist1) + len(wordlist2))]
Example #5
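# Plots word frequencies and positive/negative word histograms with Plotly.
# Assumes `go` is plotly.graph_objects and that `listToString`, `posiwordlist`,
# and `negawordlist` are defined elsewhere in the source module.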
def graphAndEvaluation(tf=''):
    file = open(tf, encoding="Latin-1")
    text = file.read()
    file.close()

    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.rabinKarp1(fullwordlist, obo.stopwords, 101)
    wordlist1 = obo.rabinKarp2(wordlist, posiwordlist, 101)
    wordlist2 = obo.rabinKarp2(wordlist, negawordlist, 101)

    wordString = listToString(wordlist)
    dictionary = obo.wordListToFreqDict(wordlist)
    dictionaryposi = obo.wordListToFreqDict(wordlist1)
    dictionarynega = obo.wordListToFreqDict(wordlist2)
    sorteddict = obo.sortFreqDict(dictionary)
    sorteddictposi = obo.sortFreqDict(dictionaryposi)
    sorteddictnega = obo.sortFreqDict(dictionarynega)

    # for s in sorteddictposi: print(str(s))

    N = 100000
    t = list(dictionary.keys())
    y = list(dictionary.values())
    fig1 = go.Figure(data=go.Scatter(x=t, y=y, mode='markers'))
    fig1.update_layout(
        title={
            'text': tf + " Word Counts",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    fig1.show()

    x1 = wordlist1
    x2 = wordlist2

    fig = go.Figure()
    fig.update_layout(
        title={
            'text': tf + " Negative and Positive Histogram",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    fig.add_trace(go.Histogram(histfunc="sum", x=x1, name="Positive Word"))
    fig.add_trace(go.Histogram(histfunc="sum", x=x2, name="Negative Word"))

    fig.show()
Example #6
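# Flask view: fetches a submitted URL, strips tags, and flashes the 21 most frequent words.
# Assumes the usual Flask imports (flash, redirect, render_template, url_for) and a WordForm class.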
def count():
    form = WordForm()
    if form.validate_on_submit():
        url = form.url.data
        response = requests.get(url)
        html = response.content.decode("utf-8")
        text = obo.stripTags(html).lower()
        fullwordlist = obo.stripNonAlphaNum(text)
        wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
        dictionary = obo.wordListToFreqDict(wordlist)
        sorteddict = obo.sortFreqDict(dictionary)
        for s in sorteddict[:21]:
            flash(str(s))
        return redirect(url_for('index'))
    return render_template('count.html',
                           title='Word Count Application',
                           form=form)
Example #7
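# Builds a space-separated string of the `limit` most frequent non-numeric words on a page.
# Python 2 code (urllib2); appends to a global `retStr` assumed to be defined elsewhere.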
def hitString(limit, url):
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()
    text = text.lower()
    '''# break into lines and remove leading and trailing space on each
	lines = (line.strip() for line in text.splitlines())
	# break multi-headlines into a line each
	chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	# drop blank lines
	text = '\n'.join(chunk for chunk in chunks if chunk)'''

    #text = text.lower()
    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    count = 0
    global retStr
    for s in sorteddict:
        mys = (str(s[1:]))
        mys2 = mys.strip("(,)")
        mys3 = mys2[1:]
        mys4 = str(mys3.strip("''"))
        mys4 = str(mys4)
        if re.match("^-?[0-9]+$", mys4):
            continue
        count += 1
        #print mys4
        retStr += str(mys4) + ' '
        if count == limit:
            break
    return retStr
		
Example #8

        title = titles[rec]
        text = title + desc

        date = datetime.strptime(dates[rec], '%Y-%m-%d')
        Q = int(ceil(date.month / 3.) - 1)
        ind = 4 * (date.year - year0) + Q
        NRecQuarter[ind] += 1

        fulltextQuarter[Q] = fulltextQuarter[Q] + text

    NRecRunning = NRecRunning + NRecYear[x]

    for q in range(4):
        desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
        desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
        desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
        desc_sorteddict = obo.sortFreqDict(desc_dictionary)

        topWords.append(desc_sorteddict[:5000])

        print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
            years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
        #for s in desc_sorteddict[:10]: print(str(s))
        #print('\n')
print('\n')

#################################################################################################
#################################################################################################
## Pickle?
Example #9
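            # Fragment: per-line stopword filtering and number-stripping, written to a
            # '-steem.txt' file that is then re-read to build a word-frequency dictionary.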
            linewords = line.split()
            resultwords = [
                word1 for word1 in linewords if word1 not in stopword
            ]
            result = ' '.join(resultwords)
            #line = line.lower().replace(stopword, "")
            line = regexnumber.sub('', result)
        stopwords.close()
        stemmout.write(result + '\n')

    stemmout.close()
    dataset.close()

    dataset = open('stemm-result/' + title + '-steem.txt', 'r')

    fullwordlist = obo.stripNonAlphaNum(dataset.read().lower())
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    dataset.close()
    dataset = open('stemm-result/' + title + '-steem.txt', 'r')

    lines = []
    regex = re.compile(r'[\n\r\t]')

    for sentence in dataset:
        line = regex.sub('', sentence)
        lines.append(str(line))

    i = 0
Example #10
                    help='Cantidad de numeros antes y despues de la palabra',
                    type=int,
                    required=True)
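# Fragment of an argparse-based CLI: fetch the requested URL, strip tags, filter stopwords
# and short words, and build a sorted word-frequency list.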
args = parser.parse_args()

statusCode = 0
try:
    r = requests.get(args.url)  # load the GET response for the page
    statusCode = r.status_code
except:
    print(
        "Formato de la web erroneo, debe ser del estilo http://www.google.com")

if statusCode == 200:
    text = obo.stripTags(r.text)  # strip the tags and convert to lower case
    fullwordlist = obo.stripNonAlphaNum(
        text)  # drop tokens that are not alphanumeric
    if args.stopwords:
        fullwordlist = obo.removeStopwords(
            fullwordlist, args.stopwords
        )  # remove the common stopwords for the given language
    if args.long:
        fullwordlist = obo.excludeTwo(
            fullwordlist,
            args.long)  # remove words with fewer than 2 characters
    dictionary = obo.wordListToFreqDict(
        fullwordlist)  # returns a word -> frequency dictionary
    sorteddict = obo.sortFreqDict(
        dictionary
    )  # sorts the words by frequency (returns a list of lists)
    if args.tipo == 'simple':
        obo.makePassfile(sorteddict,
Example #11
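# Streams arXiv OAI metadata descriptions year by year and builds a gensim Dictionary from them.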
from gensim import corpora
import obo
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

## specify range of data
years = range(1992, 2017)
fileName = 'fullDictionary.dict'

## build dictionary from stream, clean up as we go
print('\nGenerating dictionary for years {} to {} ... '.format(
    years[0], years[-1]))
doc_stream = (
    obo.removeStopwords(obo.stripNonAlphaNum(elem.text), obo.stopwords)
    for year in years
    for event, elem in ET.iterparse('RAW/arXiv-meta-{}.xml'.format(year))
    if elem.tag == 'description')
dictionary = corpora.Dictionary(doc_stream)
print('Finished!')
print(dictionary)
## save to file
dictionary.save(fileName)
print('Dictionary saved as {}'.format(fileName))
'''
## some output checks
i=-1
for d in dictionary.token2id:
	i+=1
	if i<20:
Example #12
        desc = descriptions[rec]

        title = titles[rec]
        text = title + desc

        date = datetime.strptime(dates[rec], '%Y-%m-%d')
        Q = int(ceil(date.month / 3.) - 1)
        ind = 4 * (date.year - year0) + Q
        NRecQuarter[ind] += 1

        fulltextQuarter[Q] = fulltextQuarter[Q] + text

    NRecRunning = NRecRunning + NRecYear[x]

    for q in range(4):
        desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
        desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
        desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
        desc_sorteddict = obo.sortFreqDict(desc_dictionary)

        topWords.append(desc_sorteddict[:5000])

        print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
            years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
        #for s in desc_sorteddict[:10]: 	print(str(s))
        #print('\n')
print('\n')

#################################################################################################
#################################################################################################
## Pickle?
Example #13
import requests, obo
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'
pagetext = requests.get(url)
HTML = pagetext.text
text = obo.stripTags(HTML).lower()  # make lower case
wordlist = obo.stripNonAlphaNum(
    text)  # convert to list of words, no punctuation
dictionary = obo.wordListToFreqDict(
    wordlist)  # add words, counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency
for s in sorteddict:
    print(str(s))
Example #14
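# Flask endpoint: normalizes uploaded text, applies optional stemming/lemmatization/stopword
# removal, then computes word frequencies, Zipf-style log-log points, and a least-squares
# regression line for the client-side charts.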
def hello_world():
    if request.method == "GET":
        return redirect("/app/index.html")
    else:
        pprint.pprint(request.form)
        pprint.pprint(request.files)

        #Language check
        if request.form['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")

        #Input normalization
        if request.form['upload_option'] == 'text_field':
            input_text = request.form['upload_textarea']
        elif request.form['upload_option'] == 'url':
            page_text = requests.get(request.form['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.form['upload_option'] == 'file':
            input_text = UnicodeDammit(
                request.files.get('upload_file').read()).unicode_markup

        #Stemmer selection
        if request.form['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.form['stemmer'] == 'porter':
            if request.form['language'] != 'english':
                return jsonify(status='error',
                               message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.form['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.form['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")

        #Lemmatizer selection
        if request.form['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.form['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl

        #Stopwords selection
        if request.form['stopwords'] == 'no_stopwords':
            stopwords = None
        elif request.form['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.form['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(
                request.files.get(
                    'custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)

        #Process the text
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.form['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.form['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)

        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)

        ignore_results_amount = int(request.form['ignore_results_amount'])

        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist

        else:
            initial_index = 0

        #Do the math!
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)
        distinct_words_count = len(sorteddict)
        words = []
        frequencies = []
        word_cloud = []
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq * 100.0) / word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))

        logarithms = []
        for i in range(len(sorteddict)):
            logarithms.append((math.log(i + 1), math.log(frequencies[i])))

        #Calculate Linear regression
        #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array(
            [math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y)[0]

        #Calculate the regression line start and end,
        #  and sort making the start be the one with the lower X value
        #  (highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count),
                       math.log(distinct_words_count) * m + c)
        regression_line = {'start': regline_start, 'end': regline_end}

        return jsonify(status='success',
                       words=words,
                       frequencies=frequencies,
                       percentages=percentages,
                       acum_perc_list=acum_perc_list,
                       logarithms=logarithms,
                       regression_line=regression_line,
                       resulting_text=resulting_text,
                       input_text_char_count=input_text_char_count,
                       input_text_word_count=input_text_word_count,
                       output_text_word_count=word_count,
                       word_cloud=word_cloud,
                       sorteddict=sorteddict)
Example #15
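# Fragment: LSI similarity query against a prebuilt gensim corpus.
# Only the last `query` assignment takes effect; `dictionary`, `lsi`, `index`, `titles`,
# `dates`, and `year` are assumed to be built earlier in the source script.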
#################################################################################################
## Perform query
#################################################################################################
query = "quantum simulation phase transition"
query = "quantum error correction shor"
query = "three dimensional doppler cooling theory helium"
#query = "response particle periodic potential effective mass dynamics bose einstein condensate"
query = "anderson localization matter waves"
Ntop = 300
print('\n')
print('QUERY: ' + query)
print('TOP {} hits: '.format(Ntop) + query)

## find similarity of query to articles in corpus
vec_bow = dictionary.doc2bow(
    obo.removeStopwords(obo.stripNonAlphaNum(query), obo.stopwords))
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]  # perform a similarity query against the corpus
sims = sorted(enumerate(sims), key=lambda item: -item[1])

#################################################################################################
## Prepare data for plotting
#################################################################################################
## prepare data for scatter plot -- article index, score, date, titles
iArr = [x[0] for x in sims[:Ntop]]
sArr = [x[1] for x in sims[:Ntop]]
dArr = [year[i] for i in iArr]
labelsArr = [titles[i] for i in iArr]
datesArr = [dates[i] for i in iArr]

## prepare data for line plot -- data binned into quarters
Example #16
import requests, obo
url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'
pagetext = requests.get(url)
HTML = pagetext.text
text = obo.stripTags(HTML).lower()  # convert to lower case
fullwordlist = obo.stripNonAlphaNum(text)  # only words, into list
wordlist = obo.removeStopwords(fullwordlist,
                               obo.stopwords)  # remove common useless words
dictionary = obo.wordListToFreqDict(
    wordlist)  # add words and counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

if __name__ == '__main__':
    for s in sorteddict:
        print(str(s))
Example #17
import requests, obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

pagetext = requests.get(url)
HTML = pagetext.text
wordlist = HTML.split()
text = obo.stripTags(HTML).lower() # convert to lower case
#wordlist = text.split()
wordlist = obo.stripNonAlphaNum(text) # RegEx and split done together
print(wordlist[0:150])
Example #18
# html-to-kwic.py

import obo

# create dictionary of n-grams
n = 7
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

text = obo.webPageToText(url)
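# pad both ends with '#' placeholders so KWIC windows near the edges have full context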
fullwordlist = ('# ' * (n//2)).split()
fullwordlist += obo.stripNonAlphaNum(text)
fullwordlist += ('# ' * (n//2)).split()
ngrams = obo.getNGrams(fullwordlist, n)
worddict = obo.nGramsToKWICDict(ngrams)

# output KWIC and wrap with html
target = 'black'
outstr = '<pre>'
if target in worddict:
    for k in worddict[target]:
        outstr += obo.prettyPrintKWIC(k)
        outstr += '<br />'
else:
    outstr += 'Keyword not found in source'

outstr += '</pre>'
obo.wrapStringInHTMLMac('html-to-kwic', url, outstr)
Example #19
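# Successive `years` reassignments; only the final range (2013-2016) is used below.
# `dictionary` and `lsi` are assumed to be a gensim Dictionary and LsiModel built earlier.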
years = range(1992, 1995)
years = range(1995, 1998)
years = range(1998, 2001)
years = range(2001, 2004)
years = range(2004, 2007)
years = range(2007, 2010)
years = range(2010, 2013)
years = range(2013, 2017)
scoreList = []
title = []
for year in years:
    for event, elem in ET.iterparse('arXiv-meta-{}.xml'.format(year)):
        if elem.tag == 'description':
            #queryList.append(elem.text)
            query = elem.text
            queryStripped = obo.removeStopwords(obo.stripNonAlphaNum(query),
                                                obo.stopwords)
            vec_bow = dictionary.doc2bow(queryStripped)
            vec_lsi = lsi[vec_bow]
            score = (vec_lsi[0][1], vec_lsi[1][1])
            scoreList.append(score)
        if elem.tag == 'title':
            title.append(elem.text)

x = [a[0] for a in scoreList]
y = [a[1] for a in scoreList]

import matplotlib.pyplot as plt
import mpld3
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'), figsize=(18, 12))
scatter = ax.scatter(x, y, y, alpha=0.3, cmap=plt.cm.jet)
Example #20
#html-to-freq.py

import urllib2, obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

response = urllib2.urlopen(url)
html = response.read()
text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)
dictionary = obo.wordListToFreqDict(wordlist)
sorteddict = obo.sortFreqDict(dictionary)

for s in sorteddict: print(str(s))
Example #21
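    # Django class-based view variant of the Flask handler above; assumes Newsitem and
    # Comment models plus the same text_processor, stemmer, and lemmatizer globals.
    # Note the error branches still call Flask-style jsonify() rather than JsonResponse.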
    def post(self, request):
        
        pprint.pprint(request.POST)
        pprint.pprint(request.FILES)
        
        #Language check
        if request.POST['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")
            
        if request.POST['database'] not in connections:
            return jsonify(status='error', message="Invalid database!")
        
        #Input normalization
        if request.POST['upload_option'] == 'text_field':
            input_text = request.POST['upload_textarea']
        elif request.POST['upload_option'] == 'url':
            page_text = requests.get(request.POST['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.POST['upload_option'] == 'file':
            input_text = UnicodeDammit(request.FILES['upload_file'].read()).unicode_markup
        elif request.POST['upload_option'] == 'news_comments':
            start_date_text = request.POST['news_comments_start_date']
            end_date_text = request.POST['news_comments_end_date']
            start_date = datetime.date(*[int(i) for i in start_date_text.split('-')])
            end_date = datetime.date(*[int(i) for i in end_date_text.split('-')])
            filters = {
                'date__gte': start_date,   
                'date__lte': end_date,
                'text__isnull': False
            }
            input_text = ""
            if 'news' in request.POST['news_comments']:
                queryset = Newsitem.objects\
                                   .using(request.POST['database'])\
                                   .filter(**filters)\
                                   .select_related('text')
                for newsitem in queryset:
                    input_text += "\n"+newsitem.text.text
            if 'comments' in request.POST['news_comments']:
                for comment in Comment.objects\
                                      .using(request.POST['database'])\
                                      .filter(**filters)\
                                      .select_related('text'):
                    input_text += "\n"+comment.text.text            
        #Stemmer selection
        if request.POST['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.POST['stemmer'] == 'porter':
            if request.POST['language'] != 'english':
                return jsonify(status='error', message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.POST['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.POST['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")
                
        #Lemmatizer selection
        if request.POST['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.POST['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl
            
        #Stopwords selection    
        if request.POST['stopwords'] == 'no_stopwords':    
            stopwords = None
        elif request.POST['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.POST['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(request.FILES.get('custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)
            
        #Process the text  
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.POST['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.POST['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)
          
        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)   
          
        ignore_results_amount = int(request.POST['ignore_results_amount'])  
          
        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]    
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist
                    
        else:
            initial_index = 0          
          
        #Do the math!    
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)    
        distinct_words_count = len(sorteddict)
        words = []
        frequencies = []
        word_cloud = []
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq*100.0)/word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))
            
            
        logarithms = []    
        for i in range(len(sorteddict)):    
            logarithms.append((math.log(i+1), math.log(frequencies[i])))
            
        #Calculate Linear regression
        #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array([math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y)[0]
        
        #Calculate the regression line start and end, 
        #  and sort making the start be the one with the lower X value
        #  (highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count), math.log(distinct_words_count) * m + c)
        regression_line = {
            'start': regline_start,
            'end': regline_end
        }
            
        return JsonResponse({
           'status': 'success', 
           'words': words,
           'frequencies': frequencies,
           'percentages': percentages,
           'acum_perc_list': acum_perc_list,
           'logarithms': logarithms,
           'regression_line': regression_line,
           'resulting_text': resulting_text,
           'input_text_char_count': input_text_char_count,
           'input_text_word_count': input_text_word_count,
           'output_text_word_count': word_count,
           'word_cloud': word_cloud,
           'sorteddict': sorteddict
        })