def read_batch(self, docid_to_path=None, start=0, limit=0) -> dict:
    # docid_to_path maps string document ids ("0", "1", ...) to the paths of the crawled JSON files.
    if docid_to_path is None:
        docid_to_path = dict()
    partial_index = defaultdict(list)
    ps = PorterStemmer()
    end = min(start + limit, len(docid_to_path))
    # Different tags have different importance levels (weights).
    tag_weights = {'title': 100, 'h1': 90, 'h2': 80, 'h3': 70, 'h4': 60, 'h5': 50,
                   'h6': 40, 'strong': 30, 'b': 20, 'a': 10, 'p': 1, 'span': 1, 'div': 1}
    for i in range(start, end):
        with open(docid_to_path[str(i)], "r") as file:
            json_data = json.load(file)
        content = json_data["content"]
        soup = BeautifulSoup(content, features="html.parser")
        word_frequency = defaultdict(int)
        for tag, weight in tag_weights.items():
            for tag_content in soup.find_all(tag):
                text = tag_content.get_text()
                tokens = [ps.stem(token) for token in tokenize(text)]
                temp_frequency = compute_word_frequencies(tokens)
                for token, count in temp_frequency.items():
                    if len(token) > 30:  # skip implausibly long "words"
                        continue
                    word_frequency[token] += weight * count
        # tokens = [ps.stem(token) for token in tokenize(soup.get_text())]
        # word_frequency = compute_word_frequencies(tokens)
        # TODO: similarity test here (not implemented)
        for token, weighted_count in word_frequency.items():
            partial_index[token].append(Posting(i, weighted_count))
    self.write_partial_index(partial_index)
    # Record progress so an interrupted run can resume from the next batch.
    with open(str(self.log_dir / "status.json"), "r") as file:
        status = json.load(file)
    status["read_batches"] += 1
    status["partial_index"] += 1
    with open(str(self.log_dir / "status.json"), "w") as file:
        json.dump(status, file)
    return partial_index
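# Hedged illustration (editor's addition, not original code): how the tag weights in
# read_batch combine. A term that appears once inside <title> and twice inside <p>
# contributes 100 * 1 + 1 * 2 = 102 to its weighted frequency for that document.
# The standalone helper below sketches that weighting on plain dicts; TAG_WEIGHTS
# mirrors the constant above, while the input shape is an assumption.
from collections import defaultdict

TAG_WEIGHTS = {'title': 100, 'h1': 90, 'h2': 80, 'h3': 70, 'h4': 60, 'h5': 50,
               'h6': 40, 'strong': 30, 'b': 20, 'a': 10, 'p': 1, 'span': 1, 'div': 1}

def weighted_frequency(token_counts_by_tag):
    """token_counts_by_tag: {tag: {token: raw_count}} -> {token: weighted_count}."""
    weighted = defaultdict(int)
    for tag, counts in token_counts_by_tag.items():
        for token, count in counts.items():
            weighted[token] += TAG_WEIGHTS.get(tag, 1) * count
    return dict(weighted)

# Example: weighted_frequency({'title': {'crawler': 1}, 'p': {'crawler': 2}}) -> {'crawler': 102}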
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath1", help="path of the first file to compare")
    parser.add_argument("filepath2", help="path of the second file to compare")
    parser.add_argument("print_common", help="True/T to display common keys")
    args = parser.parse_args()
    filepath_1 = args.filepath1
    filepath_2 = args.filepath2
    print_common_flag = args.print_common

    tokens_1 = tokenize(filepath_1)
    tokens_2 = tokenize(filepath_2)
    word_frequencies_1 = compute_word_frequencies(tokens_1)
    word_frequencies_2 = compute_word_frequencies(tokens_2)

    if print_common_flag in ('True', 'true', 'T', 't'):
        count = count_common(word_frequencies_1, word_frequencies_2, True)
    else:
        count = count_common(word_frequencies_1, word_frequencies_2)
    print(count)
def similarity_check(document):
    # Overlap similarity between the current page and the page crawled just before it:
    # number of shared (stop-word-filtered) terms divided by the size of the larger vocabulary.
    curr_text = ex.html_to_text(document)
    curr_tokens = tokenizer.tokenize(curr_text)
    curr_freq = tokenizer.compute_word_frequencies(curr_tokens)
    curr_freq = tokenizer.remove_stop_words(curr_freq)

    last_freq = my_pages.get_last_tokens_freq()
    numerator = tokenizer.find_intersections(curr_freq, last_freq)
    denominator = max(len(last_freq), len(curr_freq), 1)  # guard against division by zero
    similarity = numerator / denominator
    print(numerator, '/', denominator, '=', similarity)

    # Remember this page's vocabulary for the next comparison.
    my_pages.set_last_tokens_freq(curr_freq)
    return similarity, curr_tokens
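# Hedged worked example (editor's addition, not original code): similarity_check above
# computes an overlap coefficient between the stop-word-filtered vocabulary of the current
# page and that of the previously crawled page: shared_terms / max(|prev|, |curr|, 1).
# The helper below reproduces that arithmetic on plain dicts; it is a sketch, not the
# project's tokenizer module.
def overlap_similarity(prev_freq: dict, curr_freq: dict) -> float:
    shared = len(set(prev_freq) & set(curr_freq))
    return shared / max(len(prev_freq), len(curr_freq), 1)

# Example: 6 shared terms across vocabularies of size 10 and 8 -> 6 / 10 = 0.6.
# Scores near 1.0 suggest the page is a near-duplicate of the one crawled just before it.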
def print_data():
    with open("HW2output.txt", "w+") as f:
        unique_pages = my_pages.get_all_links_visited()

        # Question 1: How many unique pages did you find? Uniqueness is established by the URL,
        # discarding the fragment, so http://www.ics.uci.edu#aaa and http://www.ics.uci.edu#bbb
        # count as the same URL.
        f.write("Number Of Unique Pages: {0}\n".format(len(unique_pages)))

        # Question 2: What is the longest page in terms of number of words?
        # (HTML markup does not count as words.)
        largest_page, number_of_tokens = my_pages.get_largest_page()
        f.write("\nLongest Page: {0} \t Number Of Words: {1}\n".format(largest_page, number_of_tokens))

        # Question 3: What are the 50 most common words in the entire set of pages,
        # ignoring English stop words? Listed in order of frequency.
        all_tokens = my_pages.get_all_tokens()
        all_tokens_freq = tokenizer.compute_word_frequencies(all_tokens)
        fifty_most_common = tokenizer.get_50_most_common_words(all_tokens_freq)
        f.write("\nFifty Most Common Words: word, freq\n")
        for word, number in fifty_most_common:
            f.write("{0}, {1}\n".format(word, number))

        # Question 4: How many subdomains did you find in the ics.uci.edu domain? List the
        # subdomains alphabetically with the number of unique pages detected in each,
        # e.g. http://vision.ics.uci.edu, 10.
        f.write("\nSubdomain(URL), freq(number)\n")
        alpha = tokenizer.sort_alpha(my_pages.get_ics_sub_domains())
        for url, number in alpha:
            f.write("{0}, {1}\n".format(url, number))
def _add_document_to_index(self, file):
    # get info from the document
    json_dict = get_json_from_file(file)
    url = get_url_from_json(json_dict)

    # skip this document if the url is not valid
    if not url_is_valid(url):
        return

    # tokenize the html content, keeping tokens from important tags separate
    html_content = get_html_content_from_json(json_dict)
    (content_tokens, strong_tokens, title_tokens,
     h1_tokens, h2_tokens, h3_tokens, bold_tokens) = tokenize(html_content)
    token_freqs = compute_word_frequencies(content_tokens, strong_tokens, title_tokens,
                                           h1_tokens, h2_tokens, h3_tokens, bold_tokens)

    # add a (doc_id, frequency) posting for every token in the document
    for token, frequency in token_freqs.items():
        self._inverted_index[token].append((self._doc_id, frequency))

    # add the url and document id mapping to the doc map
    self._doc_id_map[self._doc_id] = url
    self._doc_id += 1
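# Hedged usage sketch (editor's addition, not original code): _add_document_to_index
# handles a single crawled JSON file, so an index build pass presumably walks the corpus
# directory and feeds each file to it. The build_index name, the corpus layout, and the
# assumption that `file` is a path-like object are illustrative.
from pathlib import Path

def build_index(index, corpus_dir: str) -> None:
    # Feed every crawled JSON page under corpus_dir to the indexer, one file at a time.
    for json_file in sorted(Path(corpus_dir).rglob("*.json")):
        index._add_document_to_index(json_file)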
def extract_next_links(url: str, resp: Response, state):
    if not resp.raw_response:
        return []
    print('extracting', url)
    output_links = []
    resp.url = stripTrailingSlash(resp.url)
    try:
        doc = html.document_fromstring(resp.raw_response.content)
        doc.make_links_absolute(resp.final_url if resp.is_redirected else resp.url)
        urls = [urlunparse(urlparse(i[2])) for i in doc.iterlinks()]
        urls = set(imap_multiple(urls, stripTrailingSlash, removeFragment))

        try:
            # content = h.handle(resp.raw_response.content.decode("utf-8", "ignore"))
            content = text_from_html(resp.raw_response.content)
            tokens = tokenize(content)
            word_count = len(tokens)
            words = [word for word in tokens if word not in stop_words]
            freqs = compute_word_frequencies(words)
        except Exception:
            logger.error(traceback.format_exc())
            freqs = {}
            word_count = 0

        parsed_url = urlparse(url)
        sub_domain = f'{parsed_url.scheme}://{parsed_url.hostname}'

        with state_lock:
            # Count the number of output links on the page
            counts = state['counts']
            counts[resp.url].outlink_count += len(urls)
            counts[resp.url].download_count += 1

            # Count the common words; prune the table to the 3000 most frequent
            # words once it grows past 5000 entries so the crawl state stays small.
            word_rank = state['word_rank']
            for word, freq in freqs.items():
                word_rank[word] += freq
            if len(word_rank) > 5000:
                new_word_rank = defaultdict(int)
                new_word_rank.update(
                    sorted(word_rank.items(), key=lambda x: -x[1])[:3000])
                word_rank = new_word_rank
            state['word_rank'] = word_rank

            # Count the pages seen per subdomain
            sub_domains = state['sub_domains']
            sub_domains[sub_domain] += 1
            state['sub_domains'] = sub_domains

            # Track the longest page(s) by word count
            max_count, pages = state['longest_page']
            if word_count == max_count:
                pages.append(url)
            elif word_count > max_count:
                pages = [url]
                max_count = word_count
            state['longest_page'] = (max_count, pages)

            # URL patterns to match and the maximum number of times each may be followed
            patterns = OrderedDict()
            patterns['news/view_news(php)?'] = 50
            patterns['calendar.ics.uci.edu/calendar.php'] = 0
            patterns['ganglia.ics.uci.edu'] = 0
            patterns['.*'] = -1  # any number of occurrences

            filters = [
                isHttpOrHttps,
                isInDomain([
                    "ics.uci.edu", ".cs.uci.edu", ".informatics.uci.edu", ".stat.uci.edu",
                    "today.uci.edu/department/information_computer_sciences/"
                ]),
                isNotAsset,
                queryCount(300, counts),
                patternCount(patterns, state),
                linkCount(lambda x: x < 1, counts)
            ]
            output_links = list(applyFilters(filters, urls))
            state['counts'] = counts
            state.sync()
    except Exception:
        logger.error(traceback.format_exc())
    return output_links
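# Hedged sketch (editor's addition, not original code): the pruning step inside
# extract_next_links bounds the crawl-wide word-frequency table so the shared crawl state
# stays small: once it holds more than 5000 distinct words, only the 3000 most frequent
# are kept. The standalone helper below shows the same idea; the thresholds are copied
# from the code above, the rest is an assumption.
from collections import defaultdict

def prune_word_rank(word_rank: dict, max_size: int = 5000, keep: int = 3000) -> dict:
    if len(word_rank) <= max_size:
        return word_rank
    pruned = defaultdict(int)
    pruned.update(sorted(word_rank.items(), key=lambda kv: -kv[1])[:keep])
    return pruned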