/
expt.py
93 lines (72 loc) · 3.1 KB
/
expt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# import BOSS, requests
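# from gensim import corpora, models
# NOTE: `corpora` and `models` are referenced later in this file, so the gensim
# import above has to be enabled before the script can run.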
### First, use BOSS to generate queries ###
from requests_oauthlib import OAuth1Session
### Request-token step of an OAuth 1 session against Yahoo's login API.
### NOTE(review): `client_key` and `client_secret` are never assigned in this
### file; they must be defined before this point (presumably the application's
### consumer credentials -- confirm).
request_token_url = 'https://api.login.yahoo.com/oauth/v2/get_request_token'
oauth = OAuth1Session(client_key, client_secret = client_secret)
fetch_response = oauth.fetch_request_token(request_token_url)
### Keep the token and token secret returned by the request-token endpoint.
resource_owner_key = fetch_response.get('oauth_token')
resource_owner_secret = fetch_response.get('oauth_token_secret')
### Second, for each hit, run the main loop:
### pull the page from the web
### and extract its contents from the HTML
###
### (maybe also run a spellcheck)
import requests
from bs4 import BeautifulSoup
def extract(url, timeout=30):
    """Download *url* and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``;
            without one a stalled server blocks the whole crawl forever.

    Returns:
        The text of the parsed document, as produced by
        ``BeautifulSoup.get_text()``.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.

    The HTTP status code is not checked, so the body of an error page is
    parsed like any other page.
    """
    page = requests.get(url, timeout=timeout)
    # Name the parser explicitly so the extracted text does not depend on
    # which optional parser libraries happen to be installed.
    soup = BeautifulSoup(page.content, "html.parser")
    return soup.get_text()
### Maybe do some cleaning?
### Fetch every hit and keep the text of each page, one string per hit.
### NOTE(review): `results` is never assigned in this file -- presumably the
### list of hit URLs produced by the search step above; confirm and define it
### before this line.
docs = [extract(hit) for hit in results]
### Third, tokenize each doc
import re, string
# Characters deleted before splitting: ASCII punctuation and digits.
_NON_WORD_CHARS = re.compile('[' + re.escape(string.punctuation + string.digits) + ']')


def tokenize(text, stop_list):
    """Split *text* into lowercase word tokens.

    The text is lowercased, every ASCII punctuation character and digit is
    deleted (so "don't" becomes "dont"), and the remainder is split on runs
    of whitespace. Tokens found in *stop_list* are dropped.

    Args:
        text: The document to tokenize.
        stop_list: Iterable of lowercase words to exclude; may be empty or
            ``None``.

    Returns:
        A list of tokens in document order, never containing empty strings.
    """
    ### Open questions:
    ### - This might be too simple (simply deletes punctuation, then splits
    ###   on whitespace)
    ### - It seems like code blocks get grabbed sometimes, so elim those
    ### - To keep or not to keep numbers? (digits are currently deleted)
    stripped = _NON_WORD_CHARS.sub('', text.lower())
    excluded = frozenset(stop_list or ())
    # split() with no argument collapses whitespace runs and never yields ''.
    return [word for word in stripped.split() if word not in excluded]
### Tokenize every fetched document; the stop list is empty at this point.
stop_list = []
tokenized = [tokenize(doc, stop_list) for doc in docs]
### Create bag-of-words corpus
### `dic` is built from the token lists, then each token list is converted
### with dic.doc2bow().
### NOTE(review): `corpora` is not imported anywhere in this file (the gensim
### import at the top is commented out) -- enable it before running.
dic = corpora.Dictionary(tokenized)
corpus = [dic.doc2bow(text) for text in tokenized]
##### Alternative that saves space
### One cleaned document per hit.
### NOTE(review): `clean` and `results` are not defined in this file.
docs = [clean(extract(hit)) for hit in results]
### Persist the documents one per line; the context manager makes sure the
### data is flushed and the handle closed before the file is read back.
### Assumes clean() leaves no newlines inside a document -- TODO confirm.
with open('mycorpus.txt', 'w') as corpus_file:
    corpus_file.write('\n'.join(docs))
### Dictionary needs an iterable of token lists; a raw string is rejected by
### doc2bow, so split each document the same way the file is read back later.
dic = corpora.Dictionary(doc.lower().split() for doc in docs)
class MyCorpus(object):
    """Stream a one-document-per-line text file as bag-of-words vectors.

    Iterating reads the file line by line, so the whole corpus never has to
    be held in memory at once.

    Args:
        path: File to read; defaults to 'mycorpus.txt'.
        dictionary: Object providing ``doc2bow(tokens)``. When omitted, the
            module-level ``dic`` is looked up at iteration time.
    """

    def __init__(self, path='mycorpus.txt', dictionary=None):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        """Yield ``doc2bow`` of the lowercased, whitespace-split tokens of each line."""
        dictionary = self.dictionary if self.dictionary is not None else dic
        # The context manager closes the file once iteration ends.
        with open(self.path) as corpus_file:
            for line in corpus_file:
                # doc2bow expects a list of tokens, not the raw line.
                yield dictionary.doc2bow(line.lower().split())
##### Similarly
# collect statistics about all tokens
# (the dictionary is fully built inside the `with`, then the file is closed)
with open('mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)
# remove stop words and words that appear only once
# `stop_list` is the stop-word list defined earlier in this file.
stop_ids = [dictionary.token2id[stopword] for stopword in stop_list
            if stopword in dictionary.token2id]
# dfs holds document frequencies, so this selects tokens seen in one document;
# items() works on both Python 2 and Python 3, unlike iteritems().
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)
#### in any case, once corpus constructed as bow, can serialize corpus:
### serialize() takes the output filename first, then the corpus; write to the
### same path that is loaded back on the next line.
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
### Rebind `corpus` to the on-disk version, which is streamed rather than
### held in memory.
corpus = corpora.MmCorpus('/tmp/corpus.mm')
### Then, convert bow to tfidf
tfidf = models.TfidfModel(corpus)  # object converting bow to tfidf
### Save and reload the model; both calls must use the same path.
### NOTE(review): './tmp' is relative to the working directory (unlike the
### '/tmp' paths used for the corpora) and must already exist -- confirm intent.
tfidf.save('./tmp/model.tfidf')
tfidf = models.TfidfModel.load('./tmp/model.tfidf')
### Write the transformed corpus to the file that is loaded on the next line.
corpora.MmCorpus.serialize('/tmp/tfidf.mm', tfidf[corpus])
### From here on `tfidf` names the serialized tf-idf corpus, not the model.
tfidf = corpora.MmCorpus('/tmp/tfidf.mm')
### Topic models
### Candidate topic models. Each assignment replaces the previous `model`, so
### only the last one is inspected; keep the one you want.
### At this point `corpus` is the bag-of-words corpus and `tfidf` is the
### tf-idf corpus, so give them the names the model calls use.
bow_corpus = corpus
tfidf_corpus = tfidf
### NOTE(review): `num_topics` is not assigned anywhere in this file; define
### it before this point.
model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
model = models.RpModel(tfidf_corpus, num_topics=500)
model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)
model = models.HdpModel(bow_corpus, id2word=dictionary)
### print_topics is a method; it has to be called to do anything.
model.print_topics()