def query_to_vector(raw_query, word2col):
    # create empty query vector
    query_vector = np.zeros(len(word2col))
    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)  # , stopwords file)
    # update term frequencies of query vector
    for token in query_tokens:
        column_index = word2col[token]
        query_vector[column_index] += 1
    return query_vector
def query_to_vector(self, raw_query):
    # create empty query vector
    query_vector = np.zeros(len(self.word2col))
    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)  # , stopwords file)
    # update term frequencies of query vector
    for token in query_tokens:
        try:
            column_index = self.word2col[token]
            query_vector[column_index] += 1
        except KeyError:
            logger.info("Query word not found in index: %s (stemmed)" % token)
    return query_vector
def query_to_vector_slow(raw_query):
    # all that is needed is the word2col dictionary
    word2col_file_path = file_io.get_path('word2col_file_path', None)
    with open(word2col_file_path) as json_data:
        word2col = json.load(json_data)
    # create empty query vector
    query_vector = np.zeros(len(word2col))
    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)  # , stopwords file)
    # update term frequencies of query vector
    for token in query_tokens:
        column_index = word2col[token]
        query_vector[column_index] += 1
    return query_vector
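# Hedged usage sketch (illustrative only, not part of the original module): assumes
# word2col maps each indexed term to its column position and that
# text_processing.plain_text_to_tokens applies the same normalization/stemming used
# at index time, so query terms line up with index columns.
#
#     example_word2col = {'search': 0, 'engine': 1, 'crawler': 2}
#     example_vector = query_to_vector("search engine search", example_word2col)
#     # example_vector -> array([2., 1., 0.]) if the tokenizer simply lower-cases and splits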
def load_title_document_id_term_frequency_dictionaries(indexed_directory_name):
    logger.info("Loading Title Frequency Dictionaries")
    document_title_dictionary_file_template = \
        file_io.get_template('document_title_file_path') % (indexed_directory_name, '*')
    document_id_term_frequency_dictionary = {}
    for dtd_path in glob.glob(document_title_dictionary_file_template):
        with open(dtd_path) as json_data:
            dtd = json.load(json_data)
        doc_id = str(dtd['document_id'])
        doc_title = dtd['title']
        # no title
        if doc_title is None:
            doc_title = 'NO_TITLE'
        title_tokens = text_processing.plain_text_to_tokens(doc_title)  # ,stopwords_file)
        doc_term_freq_dict = text_processing.word_frequency_dict(title_tokens)
        document_id_term_frequency_dictionary[doc_id] = doc_term_freq_dict
    return document_id_term_frequency_dictionary
def pull_summary(requested_url,
                 included_attributes=("requested_url", "redirect_history", "status_code",
                                      "content_type", "content_hash",
                                      "normalized_a_hrefs", "normalized_img_srcs")):
    """ Access a given url and return a python dictionary of page data. """
    response_summary = {'requested_url': requested_url, 'status_code': 404}
    try:
        # make request
        response = requests.get(requested_url)
        # set 'status_code' value
        response_summary['status_code'] = response.status_code
        # log status code
        logger.info("Response Status Code: %d" % response.status_code)
        # continue if status is 200
        if response.status_code == 200:
            # set 'content_hash' value
            response_summary['content_hash'] = str(hashlib.md5(str.encode(response.text)).hexdigest())
            # set 'redirect_history' value
            response_summary['redirect_history'] = []
            for redirect in response.history:
                response_summary['redirect_history'].append((redirect.url, redirect.status_code))
            # set 'content_type' value
            if 'content-type' in response.headers:
                response_summary['content_type'] = response.headers['content-type']
            # set 'binary_response_content' value
            if 'binary_response_content' in included_attributes:
                response_summary['binary_response_content'] = response.content
            # set 'plain_text' value
            if 'plain_text' in included_attributes:
                response_summary['plain_text'] = None
                if response_summary['content_type'] in file_parser.acepted_content_types():
                    response_summary['plain_text'] = file_parser.extract_plain_text(
                        response.text, response_summary['content_type'])
            # set 'tokens' value
            if 'tokens' in included_attributes:
                if response_summary['content_type'].split(';')[0] in file_parser.acepted_content_types():
                    plain_text = file_parser.extract_plain_text(response.text, response_summary['content_type'])
                    response_summary['tokens'] = text_processing.plain_text_to_tokens(plain_text)
            # set 'term_frequency_dict' value
            if 'term_frequency_dict' in included_attributes:
                if response_summary['content_type'].split(';')[0] in file_parser.acepted_content_types():
                    plain_text = file_parser.extract_plain_text(response.text, response_summary['content_type'])
                    tokens = text_processing.plain_text_to_tokens(plain_text)
                    response_summary['term_frequency_dict'] = text_processing.word_frequency_dict(tokens)
            # if type "text/html" - read links
            if 'normalized_a_hrefs' in included_attributes or 'normalized_img_srcs' in included_attributes:
                if response.headers['content-type'][:9] == "text/html":
                    # Note: base_url is requested_url
                    # set 'normalized_a_hrefs'
                    response_summary['normalized_a_hrefs'] = normalize_urls(
                        requested_url, file_parser.extract_a_hrefs_list(response.text))
                    # set 'normalized_img_srcs'
                    response_summary['normalized_img_srcs'] = normalize_urls(
                        requested_url, file_parser.extract_img_srcs_list(response.text))
    except Exception:
        logger.error("Requested Page: %s, Failed to read." % response_summary['requested_url'])
        logger.error(sys.exc_info())
    # filter attributes not in the included_attributes tuple parameter
    response_summary = {k: v for k, v in response_summary.items() if k in included_attributes}
    return response_summary
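# Hedged usage sketch (illustrative only): pull_summary returns only the keys named in
# included_attributes, so callers choose the fields they need. The URL below is a placeholder.
#
#     summary = pull_summary("http://example.com",
#                            included_attributes=("requested_url", "status_code", "plain_text"))
#     if summary.get("status_code") == 200:
#         print(summary.get("plain_text"))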
def crawl_site(self, seed_url, output_directory_name, max_urls_to_index=None, stopwords_file=None):
    self.output_directory_name = output_directory_name
    self.max_urls_to_index = max_urls_to_index
    self.stopwords_file = stopwords_file
    # resolve seed url
    self.seed_url = self.url_resolver.resolve(seed_url)
    # set forbidden_urls
    self.read_robots()
    # add seed url to url frontier
    self.url_frontier.add(self.seed_url)
    # log info
    logger.info("Beginning Site Crawl: %s" % self.seed_url)
    if self.max_urls_to_index is not None:
        logger.info("Number of Sites to Index: %d" % self.max_urls_to_index)
    else:
        logger.info("Index Forever")
    # begin crawl
    while self.continue_indexing():
        # retrieve url to index
        target_url = self.url_frontier.remove()
        # ensure it is resolved
        target_url = self.url_resolver.resolve(target_url)
        # add it to url_id_map (index)
        self.url_id_map.add(target_url)
        # log info
        logger.info("Crawling URL Number: %d" % self.url_id_map[target_url])
        logger.info("Crawling URL: %s" % target_url)
        # access site and get response summary
        response_summary = url_accessor.get_response_summary(target_url, self.url_resolver)
        if not response_summary['broken']:
            # save response summary (add id, remove binary_response_content)
            written_response_summary = {
                k: v for k, v in response_summary.items()
                if k != 'binary_response_content'
            }
            written_response_summary['url_id'] = self.url_id_map[target_url]
            response_summary_directory = self.directory_structure_dict['path_templates'][
                'response_summaries_directory_path_template'] % (self.output_directory_name)
            # if response_summary_directory does not exist, create it
            if not os.path.exists(response_summary_directory):
                os.makedirs(response_summary_directory)
            response_summary_file_path = self.directory_structure_dict['path_templates'][
                'response_summaries_file_path_template'] % (
                    self.output_directory_name, written_response_summary['url_id'])
            # write response summary file
            with open(response_summary_file_path, 'w') as file:
                file.write(json.dumps(written_response_summary))
            if response_summary['document_hash'] not in self.indexed_document_hashes:
                # extract and tokenize plain text, then save to document file
                plain_text = self.file_parser.extract_plain_text(
                    response_summary['binary_response_content'],
                    response_summary['content_type'])
                tokens = text_processing.plain_text_to_tokens(plain_text, self.stopwords_file)
                # write tokens to document file
                document_directory = self.directory_structure_dict['path_templates'][
                    'document_directory_path_template'] % (
                        self.output_directory_name, response_summary['document_hash'])
                # if document_directory does not exist, create it
                if not os.path.exists(document_directory):
                    os.makedirs(document_directory)
                document_tokens_file_path = self.directory_structure_dict['path_templates'][
                    'document_tokens_file_path_template'] % (
                        self.output_directory_name, response_summary['document_hash'])
                with open(document_tokens_file_path, 'w') as file:
                    file.write(json.dumps(tokens))
                # update document hash index since this document had not been seen before
                self.indexed_document_hashes.add(response_summary['document_hash'])
            # add new urls to the frontier and continue crawling
            # self.url_frontier.add_list(self.filter_urls(response_summary['resolved_normalized_a_hrefs']))
            for filtered_url in self.filter_urls(response_summary['resolved_normalized_a_hrefs']):
                self.url_frontier.add(filtered_url)
            print("Before Filter")
            print(response_summary['resolved_normalized_a_hrefs'])
            print("After Filter")
            print(self.filter_urls(response_summary['resolved_normalized_a_hrefs']))
            print("Queue")
            print(self.url_frontier.to_list())
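# Hedged usage sketch (illustrative only): crawl_site is a method, so it is invoked on an
# instance of the surrounding crawler class (class name not shown here; "Crawler" below is
# purely hypothetical) after its collaborators (url_resolver, url_frontier, url_id_map,
# file_parser, directory_structure_dict) have been configured.
#
#     crawler = Crawler(...)  # hypothetical constructor
#     crawler.crawl_site("http://example.com", "example_index", max_urls_to_index=100)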