Example #1
import obo  # Programming Historian-style text helpers (assumed available)
import plotly.graph_objects as go

def graphAndEvaluation(tf=''):
    # Latin-1 avoids decode errors on legacy corpora
    with open(tf, encoding="Latin-1") as file:
        text = file.read()

    # Rabin-Karp based filtering: drop stopwords, then keep matches against
    # the sentiment lists (posiwordlist and negawordlist are assumed to be
    # defined at module level)
    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.rabinKarp1(fullwordlist, obo.stopwords, 101)
    wordlist1 = obo.rabinKarp2(wordlist, posiwordlist, 101)
    wordlist2 = obo.rabinKarp2(wordlist, negawordlist, 101)

    wordString = listToString(wordlist)  # helper not shown; see the sketch below
    dictionary = obo.wordListToFreqDict(wordlist)
    dictionaryposi = obo.wordListToFreqDict(wordlist1)
    dictionarynega = obo.wordListToFreqDict(wordlist2)
    sorteddict = obo.sortFreqDict(dictionary)
    sorteddictposi = obo.sortFreqDict(dictionaryposi)
    sorteddictnega = obo.sortFreqDict(dictionarynega)

    # for s in sorteddictposi: print(str(s))

    N = 100000  # unused in this snippet
    t = list(dictionary.keys())
    y = list(dictionary.values())
    fig1 = go.Figure(data=go.Scatter(x=t, y=y, mode='markers'))
    fig1.update_layout(
        title={
            'text': tf + " Word Counts",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    fig1.show()

    x1 = wordlist1
    x2 = wordlist2

    fig = go.Figure()
    fig.update_layout(
        title={
            'text': tf + " Negative and Positive Histogram",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    fig.add_trace(go.Histogram(histfunc="sum", x=x1, name="Positive Word"))
    fig.add_trace(go.Histogram(histfunc="sum", x=x2, name="Negative Word"))

    fig.show()
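
graphAndEvaluation relies on a listToString helper that is not shown in this example. A minimal sketch of what it presumably does (an assumption, not the project's actual code):

def listToString(wordlist, separator=' '):
    # join the token list back into one space-separated string (hypothetical)
    return separator.join(wordlist)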
Example #2
File: keywords.py Project: digcat/cmistest
from tika import parser  # tika-python client
import obo
import tika_obo  # project-local helper exposing the Tika server address

def getKeywords(pdfFile, Occur):
    # parse the PDF through a running Apache Tika server
    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)

    metadata = parsed["metadata"]
    doccontent = parsed["content"]

    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    keywords = []
    shortkey = []
    maxoccur = Occur
    for s in sorteddict:
        numocc = int(s[0])
        word = s[1]  # the Python 2 original encoded this to UTF-8 bytes
        if numocc > maxoccur:
            keywords.append({word: str(numocc)})
            # track longer words separately, lower-cased
            if len(word) > 6:
                shortkey.append(word.lower())
    if Occur > 0:
        return shortkey
    return keywords
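
A hypothetical invocation (the file name is illustrative, and the call assumes a reachable Tika server):

top_short_words = getKeywords('report.pdf', 5)  # lower-cased words longer than 6 chars occurring more than 5 times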
Example #3
import requests
import obo
from flask import flash, redirect, render_template, url_for

def count():
    form = WordForm()
    if form.validate_on_submit():
        url = form.url.data
        response = requests.get(url)
        html = response.content.decode("utf-8")
        text = obo.stripTags(html).lower()
        fullwordlist = obo.stripNonAlphaNum(text)
        wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
        dictionary = obo.wordListToFreqDict(wordlist)
        sorteddict = obo.sortFreqDict(dictionary)
        # flash the 21 most frequent (count, word) pairs
        for s in sorteddict[:21]:
            flash(str(s))
        return redirect(url_for('index'))
    return render_template('count.html',
                           title='Word Count Application',
                           form=form)
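
The WordForm class is not shown in this example. A minimal Flask-WTF sketch of what it might look like (an assumption, not the original definition):

from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired

class WordForm(FlaskForm):
    # single URL field; the view reads form.url.data
    url = StringField('URL', validators=[DataRequired()])
    submit = SubmitField('Count')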
Example #4
def hitString(limit, url):
    # fetch the page (Python 2 urllib2; use urllib.request on Python 3)
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()
    text = text.lower()
    # (disabled) optional whitespace normalization: strip each line, split
    # multi-headlines on double spaces, and drop blank lines
    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    count = 0
    global retStr
    for s in sorteddict:
        # s is a (frequency, word) pair; the original recovered the word by
        # string-mangling str(s[1:]), which reduces to taking s[1]
        word = str(s[1])
        if re.match("^-?[0-9]+$", word):
            continue  # skip purely numeric tokens
        count += 1
        retStr += word + ' '
        if count == limit:
            break
    return retStr
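
A minimal invocation sketch, reusing the Old Bailey URL from Example #6 below and assuming the retStr global starts out empty:

retStr = ''
print(hitString(20, 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'))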
Example #5
def main():
    f = open("output.xls", "r")
    fsort = open("output-sort.xls", "w")

    wordstring = f.read() 
    #print('wordstring: ', wordstring)
    wordlist = wordstring.split() 
    #print('startlist 0: ', wordlist[0])
    #print('startlist 1: ', wordlist[1])
    #fsort.write(wordlist[1])

    # manual frequency list (redundant with wordListToFreqDict below, kept
    # from the original; note wordlist.count(w) inside a loop is quadratic)
    wordfreq = []
    for w in wordlist:
        wordfreq.append(wordlist.count(w))

    # build the frequency dictionary once, after the word list is complete
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    # write one "word|count" line per entry
    for s in sorteddict:
        fsort.write(s[1])
        fsort.write('|')
        fsort.write(str(s[0]))
        fsort.write('\n')

    f.close()
    fsort.close()
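
A linear-time alternative to the count() loop above, using only the standard library (a sketch, not part of the original project):

from collections import Counter

def freq_pairs(wordlist):
    # Counter makes one pass; most_common() returns (word, count) sorted by
    # descending frequency, flipped here to match sortFreqDict's (count, word)
    return [(count, word) for word, count in Counter(wordlist).most_common()]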
Example #6
# html-to-freq.py  (Python 2; on Python 3, use urllib.request.urlopen)

import urllib2, obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

response = urllib2.urlopen(url)
html = response.read()
text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)
dictionary = obo.wordListToFreqDict(wordlist)
sorteddict = obo.sortFreqDict(dictionary)

for s in sorteddict: print(str(s))
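
Every example here delegates counting and sorting to an obo.py module. A minimal sketch of the two core helpers as they appear in the Programming Historian lesson these projects build on (individual projects may diverge):

def wordListToFreqDict(wordlist):
    # map each word to the number of times it appears in the list
    wordfreq = [wordlist.count(w) for w in wordlist]
    return dict(zip(wordlist, wordfreq))

def sortFreqDict(freqdict):
    # return (count, word) pairs, most frequent first
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort()
    aux.reverse()
    return aux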
Example #7
import requests, obo
url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'
pagetext = requests.get(url)
HTML = pagetext.text
text = obo.stripTags(HTML).lower()  # convert to lower case
fullwordlist = obo.stripNonAlphaNum(text)  # only words, into list
wordlist = obo.removeStopwords(fullwordlist,
                               obo.stopwords)  # remove common useless words
dictionary = obo.wordListToFreqDict(
    wordlist)  # add words and counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

if __name__ == '__main__':
    for s in sorteddict:
        print(str(s))
Example #8

        text = title + desc

        date = datetime.strptime(dates[rec], '%Y-%m-%d')
        Q = int(ceil(date.month / 3.) - 1)  # quarter index 0-3
        ind = 4 * (date.year - year0) + Q
        NRecQuarter[ind] += 1

        fulltextQuarter[Q] = fulltextQuarter[Q] + text

    NRecRunning = NRecRunning + NRecYear[x]

    for q in range(4):
        desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
        desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
        desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
        desc_sorteddict = obo.sortFreqDict(desc_dictionary)

        topWords.append(desc_sorteddict[:5000])

        print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
            years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
        #for s in desc_sorteddict[:10]: print(str(s))
print('\n')

#################################################################################################
## Pickle?
with open('obj/' + 'NRecQuarter' + '.pkl', 'wb') as f:
    pickle.dump(NRecQuarter, f, pickle.HIGHEST_PROTOCOL)
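
The ceil-based quarter arithmetic in this example maps calendar months 1-12 onto quarter indices 0-3; a quick check of the arithmetic:

from math import ceil
# month -> quarter index: 1-3 -> 0, 4-6 -> 1, 7-9 -> 2, 10-12 -> 3
print([int(ceil(m / 3.) - 1) for m in range(1, 13)])
# [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]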
Example #9
File: diccio.py Project: QKRacha/Diccio
    print("Formato de la web erroneo, debe ser del estilo http://www.google.com")

if statusCode == 200:
    # strip the tags and convert to lower case
    text = obo.stripTags(r.text)
    # keep only alphanumeric tokens
    fullwordlist = obo.stripNonAlphaNum(text)
    if args.stopwords:
        # remove the common stopwords for the chosen language
        fullwordlist = obo.removeStopwords(fullwordlist, args.stopwords)
    if args.long:
        # remove words with fewer than 2 characters
        fullwordlist = obo.excludeTwo(fullwordlist, args.long)
    # build a word -> frequency dictionary
    dictionary = obo.wordListToFreqDict(fullwordlist)
    # sort the words by frequency (returns a list of (count, word) pairs)
    sorteddict = obo.sortFreqDict(dictionary)
    if args.tipo == 'simple':
        obo.makePassfile(sorteddict, args.file)  # write the initial pass.txt
        print('Archivo simple creado correctamente:' + args.file)
    elif args.tipo == 'numin':
        obo.makePassfile(sorteddict, args.file)  # write the initial pass.txt
        obo.numInside(args.file, args.numint)  # write passInt.txt
        print('Archivo con numeros en el interior creado correctamente:',
              args.file)
    elif args.tipo == 'numout':
        obo.makePassfile(sorteddict, args.file)
Example #10

        title = titles[rec]
        text = title + desc

        date = datetime.strptime(dates[rec], '%Y-%m-%d')
        Q = int(ceil(date.month / 3.) - 1)
        ind = 4 * (date.year - year0) + Q
        NRecQuarter[ind] += 1

        fulltextQuarter[Q] = fulltextQuarter[Q] + text

    NRecRunning = NRecRunning + NRecYear[x]

    for q in range(4):
        desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
        desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
        desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
        desc_sorteddict = obo.sortFreqDict(desc_dictionary)

        topWords.append(desc_sorteddict[:5000])

        print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
            years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
        #for s in desc_sorteddict[:10]: print(str(s))
print('\n')

#################################################################################################
## Pickle?
with open('obj/' + 'NRecQuarter' + '.pkl', 'wb') as f:
    pickle.dump(NRecQuarter, f, pickle.HIGHEST_PROTOCOL)
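
Reading the pickled quarterly counts back is symmetric (a usage sketch):

import pickle

with open('obj/NRecQuarter.pkl', 'rb') as f:
    NRecQuarter = pickle.load(f)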
Example #11
def hello_world():
    if request.method == "GET":
        return redirect("/app/index.html")
    else:
        pprint.pprint(request.form)
        pprint.pprint(request.files)

        #Language check
        if request.form['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")

        #Input normalization
        if request.form['upload_option'] == 'text_field':
            input_text = request.form['upload_textarea']
        elif request.form['upload_option'] == 'url':
            page_text = requests.get(request.form['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.form['upload_option'] == 'file':
            input_text = UnicodeDammit(
                request.files.get('upload_file').read()).unicode_markup

        #Stemmer selection
        if request.form['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.form['stemmer'] == 'porter':
            if request.form['language'] != 'english':
                return jsonify(status='error',
                               message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.form['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.form['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")

        #Lemmatizer selection
        if request.form['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.form['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl

        #Stopwords selection
        if request.form['stopwords'] == 'no_stopwords':
            stopwords = None
        elif request.form['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.form['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(
                request.files.get(
                    'custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)

        #Process the text
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.form['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.form['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)

        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)

        ignore_results_amount = int(request.form['ignore_results_amount'])

        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist

        else:
            initial_index = 0

        #Do the math!
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)
        distinct_words_count = len(sorteddict)
        words = []
        frequencies = []
        word_cloud = []
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq * 100.0) / word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))

        logarithms = []
        for i in range(len(sorteddict)):
            logarithms.append((math.log(i + 1), math.log(frequencies[i])))

        #Calculate Linear regression
        #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array(
            [math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y, rcond=None)[0]  # rcond=None avoids the FutureWarning on newer NumPy

        #Calculate the regression line start and end,
        #  and sort making the start be the one with the lower X value
        #  (highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count),
                       math.log(distinct_words_count) * m + c)
        regression_line = {'start': regline_start, 'end': regline_end}

        return jsonify(status='success',
                       words=words,
                       frequencies=frequencies,
                       percentages=percentages,
                       acum_perc_list=acum_perc_list,
                       logarithms=logarithms,
                       regression_line=regression_line,
                       resulting_text=resulting_text,
                       input_text_char_count=input_text_char_count,
                       input_text_word_count=input_text_word_count,
                       output_text_word_count=word_count,
                       word_cloud=word_cloud,
                       sorteddict=sorteddict)
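
The least-squares block above fits log(rank) against log(frequency); under Zipf's law the slope comes out near -1. The same computation as a standalone sketch:

import math
import numpy

def zipf_fit(frequencies):
    # fit log(rank) = m * log(freq) + c over a descending frequency list
    x = numpy.array([math.log(f) for f in frequencies])
    y = numpy.array([math.log(r) for r in range(1, len(frequencies) + 1)])
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    m, c = numpy.linalg.lstsq(A, y, rcond=None)[0]
    return m, c

# toy input shaped like sortFreqDict output frequencies (about 100/rank)
print(zipf_fit([100, 50, 33, 25, 20]))  # slope m is close to -1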
Example #12
    def post(self, request):
        
        pprint.pprint(request.POST)
        pprint.pprint(request.FILES)
        
        #Language check
        if request.POST['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")
            
        if request.POST['database'] not in connections:
            return jsonify(status='error', message="Invalid database!")
        
        #Input normalization
        if request.POST['upload_option'] == 'text_field':
            input_text = request.POST['upload_textarea']
        elif request.POST['upload_option'] == 'url':
            page_text = requests.get(request.POST['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.POST['upload_option'] == 'file':
            input_text = UnicodeDammit(request.FILES['upload_file'].read()).unicode_markup
        elif request.POST['upload_option'] == 'news_comments':
            start_date_text = request.POST['news_comments_start_date']
            end_date_text = request.POST['news_comments_end_date']
            start_date = datetime.date(*[int(i) for i in start_date_text.split('-')])
            end_date = datetime.date(*[int(i) for i in end_date_text.split('-')])
            filters = {
                'date__gte': start_date,   
                'date__lte': end_date,
                'text__isnull': False
            }
            input_text = ""
            if 'news' in request.POST['news_comments']:
                queryset = Newsitem.objects\
                                   .using(request.POST['database'])\
                                   .filter(**filters)\
                                   .select_related('text')
                for newsitem in queryset:
                    input_text += "\n"+newsitem.text.text
            if 'comments' in request.POST['news_comments']:
                for comment in Comment.objects\
                                      .using(request.POST['database'])\
                                      .filter(**filters)\
                                      .select_related('text'):
                    input_text += "\n"+comment.text.text            
        #Stemmer selection
        if request.POST['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.POST['stemmer'] == 'porter':
            if request.POST['language'] != 'english':
                return jsonify(status='error', message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.POST['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.POST['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")
                
        #Lemmatizer selection
        if request.POST['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.POST['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl
            
        #Stopwords selection    
        if request.POST['stopwords'] == 'no_stopwords':    
            stopwords = None
        elif request.POST['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.POST['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(request.FILES.get('custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)
            
        #Process the text  
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.POST['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.POST['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)
          
        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)   
          
        ignore_results_amount = int(request.POST['ignore_results_amount'])  
          
        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]    
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist
                    
        else:
            initial_index = 0          
          
        #Do the math!    
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)    
        distinct_words_count = len(sorteddict)
        words = []
        frequencies = []
        word_cloud = []
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq*100.0)/word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))
            
            
        logarithms = []    
        for i in range(len(sorteddict)):    
            logarithms.append((math.log(i+1), math.log(frequencies[i])))
            
        #Calculate Linear regression
        #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array([math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y, rcond=None)[0]  # rcond=None avoids the FutureWarning on newer NumPy
        
        #Calculate the regression line start and end, 
        #  and sort making the start be the one with the lower X value
        #  (highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count), math.log(distinct_words_count) * m + c)
        regression_line = {
            'start': regline_start,
            'end': regline_end
        }
            
        return JsonResponse({
           'status': 'success', 
           'words': words,
           'frequencies': frequencies,
           'percentages': percentages,
           'acum_perc_list': acum_perc_list,
           'logarithms': logarithms,
           'regression_line': regression_line,
           'resulting_text': resulting_text,
           'input_text_char_count': input_text_char_count,
           'input_text_word_count': input_text_word_count,
           'output_text_word_count': word_count,
           'word_cloud': word_cloud,
           'sorteddict': sorteddict
        })