Example #1
    # Requires json, collections.defaultdict, nltk.stem.PorterStemmer, and
    # bs4.BeautifulSoup at module level, plus the project-local tokenize,
    # compute_word_frequencies, Posting, and write_partial_index helpers.
    def read_batch(self, docid_to_path=None, start=0, limit=0) -> dict:
        if docid_to_path is None:
            docid_to_path = dict()  # maps str(doc_id) -> path of the crawled JSON file

        partial_index = defaultdict(list)
        ps = PorterStemmer()

        end = min(start + limit, len(docid_to_path))

        # Different tags have different importance levels
        tag_weights = {'title': 100, 'h1': 90, 'h2': 80, 'h3': 70, 'h4': 60,
                       'h5': 50, 'h6': 40, 'strong': 30, 'b': 20, 'a': 10,
                       'p': 1, 'span': 1, 'div': 1}

        for i in range(start, end):
            with open(docid_to_path[str(i)], "r") as file:
                json_data = json.load(file)
                content = json_data["content"]

            soup = BeautifulSoup(content, features="html.parser")

            word_frequency = defaultdict(int)

            # Weight each token's frequency by the importance of the tag it appears in
            for tag, weight in tag_weights.items():
                for tag_content in soup.find_all(tag):
                    text = tag_content.get_text()
                    tokens = [ps.stem(token) for token in tokenize(text)]
                    temp_frequency = compute_word_frequencies(tokens)
                    for token, count in temp_frequency.items():
                        # Skip abnormally long tokens (almost certainly garbage)
                        if len(token) > 30:
                            continue
                        word_frequency[token] += weight * count

            # tokens = [ps.stem(token) for token in tokenize(soup.get_text())]
            # word_frequency = compute_word_frequencies(tokens)

            # TODO: near-duplicate similarity check not yet implemented

            for k, v in word_frequency.items():
                partial_index[k].append(Posting(i, v))

        self.write_partial_index(partial_index)

        with open(str(self.log_dir / "status.json"), "r") as file:
            status = json.load(file)
            status["read_batches"] += 1
            status["partial_index"] += 1

        with open(str(self.log_dir / "status.json"), "w") as file:
            json.dump(status, file)

        return partial_index
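
The batch reader above leans on several project-local helpers that are not shown (tokenize, compute_word_frequencies, Posting, write_partial_index). A minimal sketch of what the first three might look like, assuming the tokenizer works on raw text and frequency maps are plain dicts; the names match the call sites, but the bodies are guesses rather than the original implementation:

import re
from collections import Counter

def tokenize(text: str) -> list:
    # Lowercase alphanumeric sequences; the real tokenizer may differ.
    return re.findall(r"[a-z0-9]+", text.lower())

def compute_word_frequencies(tokens: list) -> dict:
    # Map each token to the number of times it occurs.
    return dict(Counter(tokens))

class Posting:
    # Hypothetical minimal posting: a document id plus a weighted term frequency.
    def __init__(self, doc_id: int, weight: int):
        self.doc_id = doc_id
        self.weight = weight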
Example #2
import argparse

# tokenize, compute_word_frequencies, and count_common are project-local
# helpers assumed to be imported elsewhere in this module.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath1", help="path of the file to compare")
    parser.add_argument("filepath2", help="path of the file to compare")
    parser.add_argument("print_common", help="True/T to display common keys")
    args = parser.parse_args()
    filepath_1 = args.filepath1
    filepath_2 = args.filepath2
    print_common_flag = args.print_common
    tokens_1 = tokenize(filepath_1)
    tokens_2 = tokenize(filepath_2)
    word_frequencies_1 = compute_word_frequencies(tokens_1)
    word_frequencies_2 = compute_word_frequencies(tokens_2)
    if print_common_flag in ['True', 'true', 'T', 't']:
        count = count_common(word_frequencies_1, word_frequencies_2, True)
    else:
        count = count_common(word_frequencies_1, word_frequencies_2)
    print(count)
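
count_common is not shown. Based on how it is called here (two frequency dicts plus an optional flag that prints the shared keys), a plausible sketch; the signature and behavior are assumptions:

def count_common(freqs_1: dict, freqs_2: dict, print_common: bool = False) -> int:
    # Count the tokens that appear in both frequency maps; optionally print them.
    common = set(freqs_1) & set(freqs_2)
    if print_common:
        for token in sorted(common):
            print(token)
    return len(common)

From the command line the script would then be run as, e.g., python compare.py fileA.txt fileB.txt T (the script and file names here are placeholders).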
Example #3
def similarity_check(document):
    curr_text = ex.html_to_text(document)
    curr_tokens = tokenizer.tokenize(curr_text)
    curr_freq = tokenizer.compute_word_frequencies(curr_tokens)

    curr_freq = tokenizer.remove_stop_words(curr_freq)

    intersections = tokenizer.find_intersections(curr_freq, my_pages.get_last_tokens_freq())

    numerator = intersections
    denominator = max(len(my_pages.get_last_tokens_freq()), len(curr_freq), 1)
    similarity = numerator / denominator

    print(numerator, '/', denominator, '=', similarity)

    my_pages.set_last_tokens_freq(curr_freq)
    return similarity, curr_tokens
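
The score computed above is an overlap-coefficient-style measure: the number of distinct non-stop-word tokens shared with the previously crawled page, divided by the size of the larger of the two frequency maps (floored at 1 to avoid division by zero). For example, if the previous page had 40 distinct tokens, the current page has 50, and 30 are shared, the similarity is 30 / 50 = 0.6. find_intersections is not shown; a minimal sketch, assuming both arguments are token-to-count dicts:

def find_intersections(freq_a: dict, freq_b: dict) -> int:
    # Number of distinct tokens present in both frequency maps.
    return len(set(freq_a) & set(freq_b))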
Example #4
def print_data():
    f = open("HW2output.txt", "w+")
    unique_pages = my_pages.get_all_links_visited()

    # Question 1: How many unique pages did you find? Uniqueness is established by the URL, but discarding the fragment
    # part. So, for example, http://www.ics.uci.edu#aaa and http://www.ics.uci.edu#bbb are the same URL.
    # print("Number Of Unique Pages:",len(unique_pages))
    f.write("Number Of Unique Pages: {0}\n".format(len(unique_pages)))

    # f.write(str("Number Of Unique Pages:", len(unique_pages)))
    largest_page, number_of_tokens = my_pages.get_largest_page()

    # Question 2: What is the longest page in terms of number of words? (HTML markup doesn’t count as words)
    # print("Longest Page:", largest_page, '\t Number Of Words:', number_of_tokens)
    f.write("\nLongest Page: {0} \t Number Of Words: {1}\n".format(largest_page, number_of_tokens))

    # Question 3: What are the 50 most common words in the entire set of pages? (Ignore English stop words, which can
    # be found, for example, here) Submit the list of common words ordered by frequency.
    # print("Fifty Most Common Words")

    all_tokens = my_pages.get_all_tokens()
    all_tokens_freq = tokenizer.compute_word_frequencies(all_tokens)

    fifty_most_common = tokenizer.get_50_most_common_words(all_tokens_freq)

    f.write("\nFifty Most Common Word, freq\n")
    for word, number in fifty_most_common:
        f.write("{0}, {1}\n".format(word, number))
    # print(tokenizer.get_50_most_common_words(all_tokens_freq))

    # Question 4: How many subdomains did you find in the ics.uci.edu domain? Submit the list of subdomains ordered
    # alphabetically and the number of unique pages detected in each subdomain. The content of this list should be lines
    # containing URL, number, for example: http://vision.ics.uci.edu, 10 (not the actual number here)
    # print(tokenizer.sort_alpha(my_pages.get_ics_sub_domains()))
    f.write("\nSubdomain(URL), freq(number)\n")
    alpha = tokenizer.sort_alpha(my_pages.get_ics_sub_domains())
    for url, number in alpha:
        f.write("{0}, {1}\n".format(url, number))
Example #5
    def _add_document_to_index(self, file):
        # get info from document
        json_dict = get_json_from_file(file)
        url = get_url_from_json(json_dict)

        # skip this document if the url is not valid
        if not url_is_valid(url):
            return

        # add the document to the index
        html_content = get_html_content_from_json(json_dict)
        content_tokens, strong_tokens, title_tokens, h1_tokens, h2_tokens, h3_tokens, bold_tokens = tokenize(
            html_content)
        token_freqs = compute_word_frequencies(content_tokens, strong_tokens,
                                               title_tokens, h1_tokens,
                                               h2_tokens, h3_tokens,
                                               bold_tokens)
        for token, frequency in token_freqs.items():
            self._inverted_index[token].append((self._doc_id, frequency))

        # add the url and document id mapping to the doc map
        self._doc_id_map[self._doc_id] = url
        self._doc_id += 1
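
Here tokenize returns one token list per HTML region and compute_word_frequencies merges them into a single weighted frequency map; self._inverted_index is presumably a defaultdict(list) so that append works for unseen tokens. Neither helper is shown; a minimal sketch of the seven-argument merge, with entirely hypothetical weights (the original weighting scheme is not given):

from collections import defaultdict

# Hypothetical per-region weights; the real values are not shown in the source.
REGION_WEIGHTS = {"content": 1, "strong": 3, "bold": 3,
                  "h3": 4, "h2": 6, "h1": 8, "title": 10}

def compute_word_frequencies(content_tokens, strong_tokens, title_tokens,
                             h1_tokens, h2_tokens, h3_tokens, bold_tokens) -> dict:
    regions = {"content": content_tokens, "strong": strong_tokens,
               "title": title_tokens, "h1": h1_tokens, "h2": h2_tokens,
               "h3": h3_tokens, "bold": bold_tokens}
    freqs = defaultdict(int)
    for region, tokens in regions.items():
        for token in tokens:
            # Each occurrence contributes the weight of the region it came from.
            freqs[token] += REGION_WEIGHTS[region]
    return dict(freqs)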
Example #6
import traceback
from collections import OrderedDict, defaultdict
from urllib.parse import urlparse, urlunparse

from lxml import html

# Response, stop_words, logger, state_lock, text_from_html, tokenize,
# compute_word_frequencies, imap_multiple, stripTrailingSlash, removeFragment,
# and the filter factories used below are project-local and assumed to be
# imported or defined elsewhere in the crawler.
def extract_next_links(url: str, resp: Response, state):
    if not resp.raw_response:
        return []
    print('extracting', url)
    output_links = []
    resp.url = stripTrailingSlash(resp.url)
    try:
        doc = html.document_fromstring(resp.raw_response.content)
        doc.make_links_absolute(
            resp.final_url if resp.is_redirected else resp.url)

        urls = [urlunparse(urlparse(i[2])) for i in doc.iterlinks()]
        urls = set(imap_multiple(urls, stripTrailingSlash, removeFragment))
        try:
            # content = h.handle(resp.raw_response.content.decode("utf-8", "ignore"))
            content = text_from_html(resp.raw_response.content)
            tokens = tokenize(content)
            word_count = len(tokens)
            words = [word for word in tokens if word not in stop_words]
            freqs = compute_word_frequencies(words)
        except Exception:
            logger.error(traceback.format_exc())
            freqs = {}
            word_count = 0

        parsed_url = urlparse(url)
        sub_domain = f'{parsed_url.scheme}://{parsed_url.hostname}'

        with state_lock:
            # Count the number of output_links on the page
            counts = state['counts']
            counts[resp.url].outlink_count += len(urls)
            counts[resp.url].download_count += 1

            # Count the common words
            word_rank = state['word_rank']
            for word, freq in freqs.items():
                word_rank[word] += freq

            # Bound memory use: once the table exceeds 5000 words, keep only
            # the 3000 most frequent ones
            if len(word_rank) > 5000:
                top_words = sorted(word_rank.items(), key=lambda x: -x[1])[:3000]
                new_word_rank = defaultdict(int)
                new_word_rank.update(top_words)
                word_rank = new_word_rank
            state['word_rank'] = word_rank

            # Count the sub_domain
            sub_domains = state['sub_domains']
            sub_domains[sub_domain] += 1
            state['sub_domains'] = sub_domains

            # Count the longest page
            max_count, pages = state['longest_page']
            if word_count == max_count:
                pages.append(url)
            elif word_count > max_count:
                pages = [url]
                max_count = word_count
            state['longest_page'] = (max_count, pages)

            # Define URL patterns to match and their maximum allowed counts
            patterns = OrderedDict()
            patterns['news/view_news(php)?'] = 50
            patterns['calendar.ics.uci.edu/calendar.php'] = 0
            patterns['ganglia.ics.uci.edu'] = 0
            patterns['.*'] = -1  # any number of occurrences

            filters = [
                isHttpOrHttps,
                isInDomain([
                    "ics.uci.edu", ".cs.uci.edu", ".informatics.uci.edu",
                    ".stat.uci.edu",
                    "today.uci.edu/department/information_computer_sciences/"
                ]), isNotAsset,
                queryCount(300, counts),
                patternCount(patterns, state),
                linkCount(lambda x: x < 1, counts)
            ]

            output_links = list(applyFilters(filters, urls))
            state['counts'] = counts
            state.sync()

    except Exception:
        logger.error(traceback.format_exc())

    return output_links
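
The link filtering above is driven by a chain of predicates built by project-local factories (isHttpOrHttps, isInDomain, isNotAsset, queryCount, patternCount, linkCount) and applied by applyFilters. Those helpers are not shown; a sketch of the pattern, assuming each filter is simply a predicate over a URL string (the bodies below are illustrations, not the original code):

from urllib.parse import urlparse

def applyFilters(filters, urls):
    # Yield only the URLs that satisfy every predicate in the chain.
    for candidate in urls:
        if all(f(candidate) for f in filters):
            yield candidate

def isHttpOrHttps(candidate_url):
    return urlparse(candidate_url).scheme in ("http", "https")

def isInDomain(allowed):
    # Factory: build a predicate accepting URLs that contain one of the
    # allowed domain (or domain/path) fragments.
    def predicate(candidate_url):
        return any(fragment in candidate_url for fragment in allowed)
    return predicate

The stateful filters (queryCount, patternCount, linkCount) would follow the same factory shape but close over the shared counters in state.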