def getKeywords(pdfFile, Occur):
    """Extract keyword candidates from a PDF via an Apache Tika server.

    The PDF text is pulled through Tika, stripped of non-alphanumeric
    tokens and stopwords, then ranked by word frequency.

    Args:
        pdfFile: path (or URL) of the PDF handed to the Tika parser.
        Occur: frequency threshold a word must exceed to be kept.

    Returns:
        If ``Occur > 0``: a list of lower-cased words longer than six
        characters whose frequency exceeds ``Occur``.
        Otherwise: a list of single-entry dicts mapping each qualifying
        word to its frequency (as a string).
    """
    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)
    doccontent = parsed["content"]
    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    # Presumably (frequency, word) pairs, most frequent first -- matches
    # how the pairs are unpacked below; TODO confirm against obo.
    sorteddict = obo.sortFreqDict(dictionary)
    keywords = []
    shortkey = []
    for freq, word in sorteddict:
        numocc = int(freq)
        # NOTE: the original called word.encode('utf-8') here, which under
        # Python 3 made the results bytes instead of str; the encode step
        # was dropped so both return shapes contain plain strings.
        if numocc > Occur:
            keywords.append({word: str(numocc)})
            if len(word) > 6:
                shortkey.append(word.lower())
    if Occur > 0:
        return shortkey
    return keywords
def __iter__(self):
    """Stream bag-of-words vectors for every <description> element in the
    yearly arXiv metadata files under <parent-of-cwd>/DATA/SORTED/.

    Relies on module-level names: ``years`` (iterable of ints),
    ``dictionary`` (gensim Dictionary), ``obo`` and ``ET``.
    """
    for yr in years:
        xml_path = os.path.dirname(
            os.getcwd()) + '/DATA/SORTED/' + 'arXiv-meta-{}.xml'.format(yr)
        # Incremental parse: elements are visited as their end tags close.
        for _event, node in ET.iterparse(xml_path):
            if node.tag != 'description':
                continue
            tokens = obo.removeStopwords(
                obo.stripNonAlphaNum(node.text), obo.stopwords)
            yield dictionary.doc2bow(tokens)
def count():
    """Flask view: on a valid form submission, fetch the submitted URL,
    flash the 21 most frequent non-stopword tokens of the page, and
    redirect to the index; otherwise (re-)render the count form.
    """
    word_form = WordForm()
    if not word_form.validate_on_submit():
        # GET request or failed validation: just show the form.
        return render_template('count.html',
                               title='Word Count Application',
                               form=word_form)
    page = requests.get(word_form.url.data)
    page_text = obo.stripTags(page.content.decode("utf-8")).lower()
    tokens = obo.removeStopwords(obo.stripNonAlphaNum(page_text),
                                 obo.stopwords)
    freq_pairs = obo.sortFreqDict(obo.wordListToFreqDict(tokens))
    for pair in freq_pairs[:21]:
        flash(str(pair))
    return redirect(url_for('index'))
def hitString(limit, url):
    """Fetch *url*, strip scripts/markup, and append up to *limit*
    non-numeric words (stopwords removed, most frequent first) to the
    module-global ``retStr``.

    Args:
        limit: maximum number of words to append on this call.
        url: page to download and tokenise.

    Returns:
        The accumulated global ``retStr``.  NOTE(review): ``retStr`` is
        never reset here, so repeated calls keep appending -- confirm
        callers rely on that before changing it.
    """
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    # kill all script and style elements so only visible text remains
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    text = soup.get_text().lower()
    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)  # (frequency, word) pairs
    count = 0
    global retStr
    for s in sorteddict:
        # The original rebuilt the word by str()-ing the tuple slice and
        # stripping parens/quotes char by char; indexing the pair directly
        # yields the same word without the round-trip.
        word = str(s[1])
        if re.match(r"^-?[0-9]+$", word):
            continue  # skip purely numeric "words"
        count += 1
        retStr += word + ' '
        if count == limit:
            break
    return retStr
# Flush and close the report file opened earlier (``out``/``msg`` come
# from code above this chunk).
out.write(msg)
out.close()

# For each subtitle track, build a word-frequency table and remember its
# vocabulary plus the sorted frequency list for the comparison below.
for name, file in srts.items():
    #print(name, file)
    wordstring = ""
    subs = pysrt.open(file)  # parse the .srt subtitle file
    for i in subs:
        wordstring += i.text + " "  # concatenate every subtitle line
    text = wordstring.lower()
    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    wordset = set(wordlist)  # unique vocabulary of this track
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    write_freq(sorteddict, name + "_word_freq.txt")  # helper defined elsewhere
    info[name] = {}
    info[name]["wordset"] = wordset
    info[name]["sorteddict"] = sorteddict

# Words common to both tracks -- assumes srts contained the keys
# 'paddington' and 'corpse' (TODO confirm against the code above).
intersect = info['paddington']["wordset"].intersection(
    info["corpse"]["wordset"])
with open("intersection.txt", "w") as f:
    for i in intersect:
# Build a gensim Dictionary from yearly arXiv metadata XML and save it.
from gensim import corpora
import obo

# Prefer the C-accelerated ElementTree where available (Python 2 era);
# fall back to the pure-Python module.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

## specify range of data
years = range(1992, 2017)
fileName = 'fullDictionary.dict'

## build dictionary from stream, clean up as we go
print('\nGenerating dictionary for years {} to {} ... '.format(
    years[0], years[-1]))
# Generator expression: lazily yields the tokenised text of each
# <description> element (non-alphanumerics and stopwords stripped), so no
# XML file is ever held in memory at once.
doc_stream = (
    obo.removeStopwords(obo.stripNonAlphaNum(elem.text), obo.stopwords)
    for year in years
    for event, elem in ET.iterparse('RAW/arXiv-meta-{}.xml'.format(year))
    if elem.tag == 'description')
dictionary = corpora.Dictionary(doc_stream)
print('Finished!')
print(dictionary)

## save to file
dictionary.save(fileName)
print('Dictionary saved as {}'.format(fileName))
'''
## some output checks
i=-1
for d in dictionary.token2id:
    i+=1
    if i<20:
# Frequency-count the words of a Frankenstein chapter fetched over HTTP.
import requests, obo

url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'
pagetext = requests.get(url)
HTML = pagetext.text
text = obo.stripTags(HTML).lower()  # convert to lower case
fullwordlist = obo.stripNonAlphaNum(text)  # only words, into list
wordlist = obo.removeStopwords(fullwordlist,
                               obo.stopwords)  # remove common useless words
dictionary = obo.wordListToFreqDict(
    wordlist)  # add words and counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

# Print every (frequency, word) pair only when run as a script, so the
# names above can also be imported without side effects.
if __name__ == '__main__':
    for s in sorteddict:
        print(str(s))
################################################################################################# ## Perform query ################################################################################################# query = "quantum simulation phase transition" query = "quantum error correction shor" query = "three dimensional doppler cooling theory helium" #query = "response particle periodic potential effective mass dynamics bose einstein condensate" query = "anderson localization matter waves" Ntop = 300 print('\n') print('QUERY: ' + query) print('TOP {} hits: '.format(Ntop) + query) ## find similarity of query to articles in corpus vec_bow = dictionary.doc2bow( obo.removeStopwords(obo.stripNonAlphaNum(query), obo.stopwords)) vec_lsi = lsi[vec_bow] sims = index[vec_lsi] # perform a similarity query against the corpus sims = sorted(enumerate(sims), key=lambda item: -item[1]) ################################################################################################# ## Prepare data for plotting ################################################################################################# ## prepare data for scatter plot -- article index, score, date, titles iArr = [x[0] for x in sims[:Ntop]] sArr = [x[1] for x in sims[:Ntop]] dArr = [year[i] for i in iArr] labelsArr = [titles[i] for i in iArr] datesArr = [dates[i] for i in iArr] ## prepare data for line plot -- data binned into quarters
# NOTE(review): this fragment appears to run inside enclosing loops (a
# record index `rec` and a year index `x`); `desc`, `year0`, `titles`,
# `dates`, `NRecYear`, `fulltextQuarter`, `topWords` come from that outer
# scope -- confirm indentation/nesting against the full file.
title=titles[rec]
text=title+desc
date=datetime.strptime(dates[rec],'%Y-%m-%d')
Q=int(ceil(date.month/3.)-1)  # quarter index 0-3 derived from the month
ind = 4*(date.year-year0)+Q  # flat (year, quarter) bin index
NRecQuarter[ind]+=1
fulltextQuarter[Q]=fulltextQuarter[Q]+text
NRecRunning=NRecRunning+NRecYear[x]
# Rank each quarter's concatenated text by word frequency; keep top 5000.
for q in range(4):
    desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
    desc_wordlist = obo.removeStopwords(desc_fullwordlist,obo.stopwords)
    desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
    desc_sorteddict = obo.sortFreqDict(desc_dictionary)
    topWords.append(desc_sorteddict[:5000])
    print ('Year: {}; Quarter: Q{}; Num. entries: {}'.format(years[x],q+1,NRecQuarter[4*(date.year-year0)+q]))
    #for s in desc_sorteddict[:10]: print(str(s))
    #print('\n')
print('\n')

#################################################################################################
#################################################################################################
## Pickle?
## Pickle?
with open('obj/'+ 'NRecQuarter' + '.pkl', 'wb') as f:
# Probe the target URL; statusCode stays 0 when the request fails.
statusCode = 0
try:
    r = requests.get(args.url)  # load the GET response for the page
    statusCode = r.status_code
except:
    # NOTE(review): bare except swallows every error (typos included);
    # consider catching requests.exceptions.RequestException instead.
    print(
        "Formato de la web erroneo, debe ser del estilo http://www.google.com")
if statusCode == 200:
    text = obo.stripTags(r.text)  # strip the HTML tags
    fullwordlist = obo.stripNonAlphaNum(
        text)  # keep only alphanumeric tokens
    if args.stopwords:
        fullwordlist = obo.removeStopwords(
            fullwordlist, args.stopwords
        )  # drop common words for the chosen language
    if args.long:
        fullwordlist = obo.excludeTwo(
            fullwordlist, args.long)  # drop words below the length limit
    dictionary = obo.wordListToFreqDict(
        fullwordlist)  # word -> frequency mapping
    sorteddict = obo.sortFreqDict(
        dictionary
    )  # words ordered by frequency (a list of pairs)
    if args.tipo == 'simple':
        obo.makePassfile(sorteddict, args.file)  # write the pass.txt file
        print('Archivo simple creado correctamente:' + args.file)
    elif args.tipo == 'numin':
# NOTE(review): near-duplicate of an earlier fragment; it appears to run
# inside enclosing loops (record index `rec`, year index `x`), with
# `desc`, `year0`, `titles`, `dates`, `NRecYear`, `fulltextQuarter`,
# `topWords` supplied by that outer scope -- confirm nesting in the file.
title = titles[rec]
text = title + desc
date = datetime.strptime(dates[rec], '%Y-%m-%d')
Q = int(ceil(date.month / 3.) - 1)  # quarter index 0-3 from the month
ind = 4 * (date.year - year0) + Q  # flat (year, quarter) bin index
NRecQuarter[ind] += 1
fulltextQuarter[Q] = fulltextQuarter[Q] + text
NRecRunning = NRecRunning + NRecYear[x]
# Rank each quarter's concatenated text by word frequency; keep top 5000.
for q in range(4):
    desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
    desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
    desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
    desc_sorteddict = obo.sortFreqDict(desc_dictionary)
    topWords.append(desc_sorteddict[:5000])
    print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
        years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
    #for s in desc_sorteddict[:10]: print(str(s))
    #print('\n')
print('\n')

#################################################################################################
#################################################################################################
## Pickle?
## Pickle?