Example #1
from tika import parser   # Apache Tika client used to pull the text out of the PDF
import tika_obo           # local helper that returns the address of the Tika server
import obo


def getKeywords(pdfFile, Occur):
    """Return the words that occur more than Occur times in the PDF."""
    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)

    metadata = parsed["metadata"]
    doccontent = parsed["content"]

    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    keywords = []
    shortkey = []
    maxoccur = Occur
    for s in sorteddict:
        numocc = int(s[0])
        word = s[1]
        if numocc > maxoccur:
            keywords.append({word: str(numocc)})
            if len(word) > 6:
                shortkey.append(word.lower())
    if Occur > 0:
        return shortkey   # only the longer keywords, lower-cased
    return keywords       # full list of {word: count} entries
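Example #1 and every snippet below lean on the same small set of helpers from the Programming Historian's obo.py module. For reference, a minimal sketch of the four used throughout, reconstructed from that lesson (the stopwords list, expected as obo.stopwords, and the stripTags helper used in Examples #3 and #7 belong to the same module but are omitted here):

import re

def stripNonAlphaNum(text):
    # split the text on any run of non-alphanumeric characters
    return re.compile(r'\W+', re.UNICODE).split(text)

def removeStopwords(wordlist, stopwords):
    # keep only the words that are not in the stopword list
    return [w for w in wordlist if w not in stopwords]

def wordListToFreqDict(wordlist):
    # map each word to the number of times it appears
    wordfreq = [wordlist.count(w) for w in wordlist]
    return dict(zip(wordlist, wordfreq))

def sortFreqDict(freqdict):
    # turn the dict into a list of (frequency, word) pairs, most frequent first
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort()
    aux.reverse()
    return aux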
Example #2
    # __iter__ of a streaming-corpus class: walks one arXiv metadata XML file per
    # year and yields each <description> as a gensim bag-of-words vector.
    # 'years', 'dictionary' (a gensim Dictionary), 'ET', 'os' and 'obo' are set up
    # elsewhere in the original script.
    def __iter__(self):
        for year in years:
            fname = 'arXiv-meta-{}.xml'.format(year)
            # path = os.path.dirname(os.getcwd()) + '\\DATA\\' + fname
            path = os.path.dirname(os.getcwd()) + '/DATA/SORTED/' + fname
            for event, elem in ET.iterparse(path):
                if elem.tag == 'description':
                    desc = obo.removeStopwords(obo.stripNonAlphaNum(elem.text),
                                               obo.stopwords)
                    yield dictionary.doc2bow(desc)
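Only the __iter__ method survives in this snippet; a sketch of the kind of streaming-corpus class it belongs to and of how gensim consumes it. The class name, paths and file names here are illustrative:

import os
import xml.etree.ElementTree as ET
from gensim import corpora
import obo

class DescriptionCorpus:
    # hypothetical wrapper: streams one bag-of-words vector per <description> element
    def __init__(self, dictionary, years):
        self.dictionary = dictionary
        self.years = years

    def __iter__(self):
        for year in self.years:
            fname = 'arXiv-meta-{}.xml'.format(year)
            path = os.path.dirname(os.getcwd()) + '/DATA/SORTED/' + fname
            for event, elem in ET.iterparse(path):
                if elem.tag == 'description':
                    desc = obo.removeStopwords(obo.stripNonAlphaNum(elem.text),
                                               obo.stopwords)
                    yield self.dictionary.doc2bow(desc)
                    elem.clear()   # release the parsed element to keep memory flat

# usage: stream the corpus to disk so later models never hold it all in RAM
# dictionary = corpora.Dictionary.load('fullDictionary.dict')
# corpora.MmCorpus.serialize('arxiv_bow.mm', DescriptionCorpus(dictionary, range(1992, 2017)))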
Example #3
import requests
from flask import flash, redirect, render_template, url_for

import obo

# Flask view for a simple word-count form; the WordForm class and the
# @app.route registration live elsewhere in the app (a sketch of the form
# follows this example).
def count():
    form = WordForm()
    if form.validate_on_submit():
        url = form.url.data
        response = requests.get(url)
        html = response.content.decode("utf-8")
        text = obo.stripTags(html).lower()
        fullwordlist = obo.stripNonAlphaNum(text)
        wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
        dictionary = obo.wordListToFreqDict(wordlist)
        sorteddict = obo.sortFreqDict(dictionary)
        for s in sorteddict[:21]:
            flash(str(s))   # show the 21 most frequent (count, word) pairs
        return redirect(url_for('index'))
    return render_template('count.html',
                           title='Word Count Application',
                           form=form)
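The view assumes a WordForm defined elsewhere in the app. A plausible minimal definition using Flask-WTF; the labels and validators here are guesses, only the url field name is fixed by the view above:

from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired, URL

class WordForm(FlaskForm):
    # the view reads form.url.data, so the field must be called 'url'
    url = StringField('URL to count', validators=[DataRequired(), URL()])
    submit = SubmitField('Count words')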
Example #4
import re
import urllib.request

from bs4 import BeautifulSoup
import obo


def hitString(limit, url):
    # fetch the page and parse it
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()
    text = text.lower()
    '''# break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)'''

    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)
    count = 0
    global retStr  # accumulates into a module-level string defined by the caller
    for s in sorteddict:
        word = str(s[1])          # s is a (count, word) pair
        if re.match("^-?[0-9]+$", word):
            continue              # skip purely numeric tokens
        count += 1
        retStr += word + ' '
        if count == limit:
            break
    return retStr
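hitString() appends to a module-level retStr instead of building a local string, so the caller has to initialise it first. A short usage sketch, reusing the Frankenstein chapter URL that also appears in Example #7:

retStr = ""   # module-level accumulator that hitString() writes into

if __name__ == "__main__":
    url = "http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html"
    print(hitString(20, url))   # the 20 most frequent non-numeric words, space-separated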
Example #5
import pysrt   # SubRip subtitle parser
import obo

# 'srts' (mapping a short film name to its .srt file path) is built earlier in
# the original script and is not shown in this snippet
info = {}   # collects each film's word set and sorted frequency list


def write_freq(sorteddict, fname):
    # assumed shape of the truncated helper used below: write the sorted
    # (count, word) pairs, one per line, to a text file
    out = open(fname, "w")
    msg = "\n".join(str(s) for s in sorteddict)
    out.write(msg)
    out.close()


for name, file in srts.items():
    #print(name, file)
    wordstring = ""
    subs = pysrt.open(file)

    for i in subs:
        wordstring += i.text + " "

    text = wordstring.lower()

    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    wordset = set(wordlist)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    write_freq(sorteddict, name + "_word_freq.txt")

    info[name] = {}
    info[name]["wordset"] = wordset
    info[name]["sorteddict"] = sorteddict

intersect = info['paddington']["wordset"].intersection(
    info["corpse"]["wordset"])

with open("intersection.txt", "w") as f:
    for i in intersect:
        f.write(i + "\n")   # one shared word per line
Example #6
from gensim import corpora
import obo
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

## specify range of data
years = range(1992, 2017)
fileName = 'fullDictionary.dict'

## build dictionary from stream, clean up as we go
print('\nGenerating dictionary for years {} to {} ... '.format(
    years[0], years[-1]))
doc_stream = (
    obo.removeStopwords(obo.stripNonAlphaNum(elem.text), obo.stopwords)
    for year in years
    for event, elem in ET.iterparse('RAW/arXiv-meta-{}.xml'.format(year))
    if elem.tag == 'description')
dictionary = corpora.Dictionary(doc_stream)
print('Finished!')
print(dictionary)
## save to file
dictionary.save(fileName)
print('Dictionary saved as {}'.format(fileName))
'''
## some output checks
i = -1
for d in dictionary.token2id:
    i += 1
    if i < 20:
'''
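The commented-out check above is cut short; a runnable version of the same spot check on the freshly built dictionary:

## spot check: print the first 20 tokens and their integer ids
for i, token in enumerate(dictionary.token2id):
    if i >= 20:
        break
    print(token, dictionary.token2id[token])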
Example #7
import requests, obo
url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'
pagetext = requests.get(url)
HTML = pagetext.text
text = obo.stripTags(HTML).lower()  # convert to lower case
fullwordlist = obo.stripNonAlphaNum(text)  # only words, into list
wordlist = obo.removeStopwords(fullwordlist,
                               obo.stopwords)  # remove common useless words
dictionary = obo.wordListToFreqDict(
    wordlist)  # add words and counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

if __name__ == '__main__':
    for s in sorteddict:
        print(str(s))
#################################################################################################
## Perform query  (a separate script: querying an LSI model built over the arXiv description corpus)
#################################################################################################
query = "quantum simulation phase transition"
query = "quantum error correction shor"
query = "three dimensional doppler cooling theory helium"
#query = "response particle periodic potential effective mass dynamics bose einstein condensate"
query = "anderson localization matter waves"
Ntop = 300
print('\n')
print('QUERY: ' + query)
print('TOP {} hits: '.format(Ntop) + query)

## find similarity of query to articles in corpus
## ('dictionary', 'lsi' and 'index' are gensim objects built earlier in this script;
##  a sketch of that construction follows at the end of this example)
vec_bow = dictionary.doc2bow(
    obo.removeStopwords(obo.stripNonAlphaNum(query), obo.stopwords))
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]  # perform a similarity query against the corpus
sims = sorted(enumerate(sims), key=lambda item: -item[1])

#################################################################################################
## Prepare data for plotting
#################################################################################################
## prepare data for scatter plot -- article index, score, date, titles
## ('year', 'titles' and 'dates' are per-article lists loaded earlier in the script)
iArr = [x[0] for x in sims[:Ntop]]
sArr = [x[1] for x in sims[:Ntop]]
dArr = [year[i] for i in iArr]
labelsArr = [titles[i] for i in iArr]
datesArr = [dates[i] for i in iArr]

## prepare data for line plot -- data binned into quarters
## (these lines sit inside a loop over years, index x, and an inner loop over that
##  year's records, index rec; 'desc' is presumably the record's description text,
##  and year0, NRecYear, NRecQuarter, fulltextQuarter, topWords, datetime and ceil
##  are all set up earlier in the script)
        title = titles[rec]
        text = title + desc

        date = datetime.strptime(dates[rec], '%Y-%m-%d')
        Q = int(ceil(date.month / 3.) - 1)   # 0..3: the quarter this record falls in
        ind = 4 * (date.year - year0) + Q
        NRecQuarter[ind] += 1

        fulltextQuarter[Q] = fulltextQuarter[Q] + text

    NRecRunning = NRecRunning + NRecYear[x]

    for q in range(4):
        desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
        desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
        desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
        desc_sorteddict = obo.sortFreqDict(desc_dictionary)

        topWords.append(desc_sorteddict[:5000])   # keep the 5000 most frequent words per quarter

        print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
            years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
        # for s in desc_sorteddict[:10]: print(str(s))
        # print('\n')
print('\n')
#################################################################################################
## Pickle?
import pickle

with open('obj/' + 'NRecQuarter' + '.pkl', 'wb') as f:
    pickle.dump(NRecQuarter, f)   # save the quarterly counts for later use
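The query step near the top of this script uses a trained model lsi and a similarity index index without showing how they were built. A hedged sketch of the usual gensim construction, reusing the dictionary saved in Example #6; the corpus file name and num_topics are illustrative:

from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('fullDictionary.dict')        # saved in Example #6
corpus = corpora.MmCorpus('arxiv_bow.mm')                          # a previously serialized bag-of-words corpus
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)  # latent semantic indexing model
index = similarities.MatrixSimilarity(lsi[corpus])                 # dense index for cosine-similarity queries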
Example #10
import requests
import obo

# 'args' comes from an argparse parser set up earlier in the original script
# (see the sketch after this example)
statusCode = 0
try:
    r = requests.get(args.url)   # fetch the page with a GET request
    statusCode = r.status_code
except requests.exceptions.RequestException:
    print("Wrong URL format, it should look like http://www.google.com")

if statusCode == 200:
    text = obo.stripTags(r.text)   # strip the HTML tags
    fullwordlist = obo.stripNonAlphaNum(text)   # keep only alphanumeric tokens
    if args.stopwords:
        # remove the common words of the chosen language
        fullwordlist = obo.removeStopwords(fullwordlist, args.stopwords)
    if args.long:
        # drop the words with fewer than 2 characters
        fullwordlist = obo.excludeTwo(fullwordlist, args.long)
    dictionary = obo.wordListToFreqDict(fullwordlist)   # gives back a word -> frequency dictionary
    sorteddict = obo.sortFreqDict(dictionary)   # sorts the words by frequency (a list of pairs)
    if args.tipo == 'simple':
        obo.makePassfile(sorteddict, args.file)   # create the first pass.txt file
        print('Simple file created successfully: ' + args.file)
    elif args.tipo == 'numin':
        pass   # branch not shown in this snippet
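This snippet reads everything from an argparse namespace (args.url, args.stopwords, args.long, args.tipo, args.file) that is set up earlier in the original script. A rough sketch of what that parser might look like; the option flags, types, defaults and help texts are guesses:

import argparse

argparser = argparse.ArgumentParser(description='Build a wordlist from a web page')
argparser.add_argument('url', help='page to fetch, e.g. http://www.google.com')
argparser.add_argument('--stopwords', default=None,
                       help='stopword list passed to obo.removeStopwords')
argparser.add_argument('--long', type=int, default=0,
                       help='minimum word length enforced by obo.excludeTwo')
argparser.add_argument('--tipo', choices=['simple', 'numin'], default='simple',
                       help='output format')
argparser.add_argument('--file', default='pass.txt', help='output file name')
args = argparser.parse_args()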