# Imports needed by the methods below (urllib2 and BeautifulSoup 3, per the
# Python 2 print statements and the parseOnlyThese keyword). The module-level
# constants baseURL, queryString, searchPageString, postPageString and
# postString are assumed to be defined elsewhere in the original module.
import re
import time
from math import exp, log
from urllib2 import urlopen, HTTPError
from multiprocessing import Process, Queue, cpu_count
from BeautifulSoup import BeautifulSoup as bs, SoupStrainer as ss


def __search_results(self, page):
    start = time.time()
    # Page 1 has no page suffix; later pages append searchPageString + page.
    if page == 1:
        results = bs(urlopen(baseURL + queryString + self.searchTerm),
                     parseOnlyThese=ss('a', 'result_primary_link'))
    else:
        results = bs(urlopen(baseURL + queryString + self.searchTerm +
                             searchPageString + str(page)),
                     parseOnlyThese=ss('a', 'result_primary_link'))
    # Visit only 'Talk' results that have not been collected already.
    for link in results.contents:
        if link['result-type'] == 'Talk' and link['href'] not in self.listOfPosts:
            Investigator.__result(self, link['href'])
    print "__search_results Elapsed Time: %s" % (time.time() - start), self.searchTerm, ' page: ', page
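# A standalone sketch (not part of the class) of the BeautifulSoup 3
# SoupStrainer pattern used throughout: parseOnlyThese restricts parsing to
# the matching tags, so .contents holds exactly the matched links. The HTML
# string below is made up for illustration.
from BeautifulSoup import BeautifulSoup, SoupStrainer

html = '<a class="result_primary_link" href="/post/a1">hit</a><p>skipped</p>'
links = BeautifulSoup(html,
                      parseOnlyThese=SoupStrainer('a', 'result_primary_link'))
for link in links.contents:
    print link['href']  # prints: /post/a1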
def __user(self, user):
    try:
        start = time.time()
        inQueue = Queue()
        outQueue = Queue()
        processes = []
        # Collect this user's post links from their activity page.
        links = bs(urlopen(baseURL + user + '/activity'),
                   parseOnlyThese=ss('a', href=re.compile('/post/a.')))
        for link in links.contents:
            if link['href'] not in self.visitedPosts:
                inQueue.put(link['href'])
                self.visitedPosts.append(link['href'])
        # Fan out to one worker per core. Each worker consumes until it sees
        # a 'STOP' sentinel, so one sentinel is queued per worker; a single
        # sentinel would leave the remaining workers blocked on get().
        for i in range(cpu_count()):
            p = Process(target=Investigator.__posts,
                        args=(self, inQueue, outQueue))
            p.start()
            processes.append(p)
            inQueue.put('STOP')
        for p in processes:
            p.join()
        outQueue.put('STOP')
        for post in iter(outQueue.get, 'STOP'):
            self.listOfPosts.append(post)
        print "__user Elapsed Time: %s" % (time.time() - start), user
    except HTTPError:
        print 'HTTPError:', user
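# A minimal standalone sketch of the queue/worker/sentinel pattern that
# __user relies on, using only the standard library. echo_worker is a
# hypothetical stand-in for __posts.
from multiprocessing import Process, Queue, cpu_count

def echo_worker(inqueue, outqueue):
    # Consume items until the 'STOP' sentinel arrives.
    for item in iter(inqueue.get, 'STOP'):
        outqueue.put(item * 2)

if __name__ == '__main__':
    inq, outq = Queue(), Queue()
    for n in range(10):
        inq.put(n)
    workers = [Process(target=echo_worker, args=(inq, outq))
               for _ in range(cpu_count())]
    for w in workers:
        w.start()
        inq.put('STOP')  # one sentinel per worker
    for w in workers:
        w.join()
    outq.put('STOP')
    print sorted(iter(outq.get, 'STOP'))  # [0, 2, 4, ..., 18]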
def __frequent_words_worker(inqueue, outqueue):
    # Pull post paths off the queue until the 'STOP' sentinel arrives and
    # emit the text of each post's first 'content' div.
    for post in iter(inqueue.get, 'STOP'):
        try:
            texts = bs(urlopen(baseURL + post),
                       parseOnlyThese=ss('div', 'content'))
            outqueue.put(texts.contents[0].text)
            print 'frequent_words ', post
        except Exception:
            # Covers HTTPError and posts with no 'content' div (IndexError).
            print 'frequent_words Error:', post
            outqueue.put(-1)
def __result(self, post):
    try:
        # Page through the post's comment pages, recruiting every commenter
        # we have not visited yet; stop when a page contains only a single
        # nickname.
        pageCount = 1
        while True:
            if pageCount == 1:
                users = bs(urlopen(baseURL + post),
                           parseOnlyThese=ss('div', 'user_nickname'))
            else:
                users = bs(urlopen(baseURL + post + postPageString +
                                   str(pageCount)),
                           parseOnlyThese=ss('div', 'user_nickname'))
            if len(users.contents) == 1:
                break
            for user in users.contents:
                if user.a['href'] not in self.visitedUsers:
                    self.visitedUsers.append(user.a['href'])
                    Investigator.__user(self, user.a['href'])
            pageCount += 1
    except HTTPError:
        print 'HTTPError:', post
def __posts(self, inqueue, outqueue):
    for post in iter(inqueue.get, 'STOP'):
        try:
            texts = bs(urlopen(baseURL + post),
                       parseOnlyThese=ss('div', 'post_content'))
            # Report the post only if the search term appears in a reply
            # (contents[1:]) but not in the original post (contents[0]).
            if len(texts.contents) > 1:
                if not texts.contents[0].find(text=re.compile(self.searchTerm)):
                    for content in texts.contents[1:]:
                        if content.find(text=re.compile(self.searchTerm)):
                            outqueue.put(post)
                            break
        except HTTPError:
            print 'HTTPError:', post
def find(self, lastvisited):
    # Read the newest post id off the front page, then score every post
    # between the last visited id and that maximum.
    new = bs(urlopen(baseURL + '/?all_pf=all-newest#all-active-posts'),
             parseOnlyThese=ss('a', 'post_list_post_link_url'))
    newPosts = []
    for post in new.contents:
        # Characters 7-14 of the href hold the numeric post id.
        newPosts.append(int(post['href'][7:15]))
    maxPost = max(newPosts)
    post = lastvisited
    while post < maxPost:
        relevance = Condition.__relevance_index(self, postString + str(post))
        if relevance != 0:
            self.relevantPosts[postString + str(post)] = relevance
            print postString + str(post), relevance
        post += 1
def __relevance_index(self, post):
    try:
        originalPost = bs(urlopen(baseURL + post),
                          parseOnlyThese=ss('div', 'content'))
        if len(originalPost.contents) != 0:
            words = re.findall(r'\w+', originalPost.contents[0].text)
            # Ignore short posts; score the rest by the geometric mean of
            # the weights of the words found in weightsDictionary.
            if len(words) > 100:
                weightsList = []
                for word in words:
                    if word in self.weightsDictionary:
                        weightsList.append(self.weightsDictionary[word])
                if len(weightsList) != 0:
                    geometricMean = exp((1 / float(len(weightsList))) *
                                        sum([log(x) for x in weightsList]))
                    return geometricMean
        return 0
    except HTTPError:
        return 0
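# A quick standalone check of the log-domain geometric mean computed above:
# exp(mean(log(w))) equals the n-th root of the product of the weights. The
# weight values below are made up for illustration.
from math import exp, log

weights = [2.0, 8.0, 4.0]
gm = exp(sum(log(w) for w in weights) / float(len(weights)))
print gm  # ~4.0, i.e. (2.0 * 8.0 * 4.0) ** (1 / 3.0)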