def create_graph():
    """Build a directed link graph from the page files stored under ``DEV/``.

    Nodes ``1..len(mapping)`` are created up front. For every page file whose
    URL is a key of the module-level ``mapping``, one edge is added from that
    page's mapped id to each item returned by ``get_all_sublinks``.

    Returns:
        The populated ``nx.DiGraph``. The graph is returned on every path,
        including when the early-exit ids below are never encountered
        (previously that path fell off the end and returned ``None``).
    """
    graph = nx.DiGraph()
    dev_directory = "DEV/"
    graph.add_nodes_from(range(1, len(mapping) + 1))

    for direct in os.listdir(dev_directory):
        # macOS Finder metadata file, not a crawl sub-directory.
        if direct == '.DS_Store':
            continue
        for filename in os.listdir(dev_directory + direct):
            page = Url(dev_directory + direct + '/' + filename)
            url = page.get_url()
            if url not in mapping:
                continue

            source_id = mapping[url]
            # NOTE(review): the items yielded by get_all_sublinks are used
            # directly as edge targets, so they are presumably already node
            # ids rather than raw URLs -- confirm against that helper.
            list_of_links = get_all_sublinks(page.get_html(), url)
            graph.add_edges_from(
                [(source_id, final_edge) for final_edge in list_of_links]
            )

            # Hard-coded early exit kept from the original code.
            # NOTE(review): 34479/34480 look like the ids of the last
            # documents in the corpus -- TODO confirm, or derive the stop
            # condition from len(mapping) instead.
            if source_id == 34480 or source_id == 34479:
                return graph

    return graph
def get_all_files(dev_directory, batch_size=1000):
    """Index every page file under ``dev_directory`` into partial index files.

    Each sub-directory of ``dev_directory`` (except ``.DS_Store``) is scanned.
    Every file is wrapped in a ``Url`` and passed to
    ``Html_Reader.read_file``, which fills the in-memory inverted index.
    Successfully read documents get consecutive ids starting at 1, recorded
    in the module-level ``DOC_ID_DICT``. After every ``batch_size``
    successfully read documents the in-memory index is handed to
    ``write_to_index`` under the next file number and then cleared. Once all
    files are processed, any remaining postings are written and
    ``write_doc_ids(DOC_ID_DICT)`` is called.

    Args:
        dev_directory: Root directory of the crawl dump. A trailing slash is
            accepted but no longer required.
        batch_size: Number of successfully read documents per partial index
            file. Expected to be a positive integer; defaults to 1000, the
            value that was previously hard-coded.
    """
    file_count_name = 'indexes2/inverted_index_'
    file_count = 0
    file_count_name_count = 1
    doc_id = 1
    inverted_index = defaultdict(list)
    reader = Html_Reader()

    for direct in os.listdir(dev_directory):
        # macOS Finder metadata file, not a crawl sub-directory.
        if direct == '.DS_Store':
            continue
        subdir = os.path.join(dev_directory, direct)
        for filename in os.listdir(subdir):
            page = Url(os.path.join(subdir, filename))
            try:
                read = reader.read_file(page.get_html(), doc_id, inverted_index)
                if read:
                    file_count += 1
                    DOC_ID_DICT[doc_id] = page.get_url()
                    doc_id += 1
            except Exception as e:
                # Best effort: one unreadable page must not abort the whole
                # run. Append rather than truncate so that every failure is
                # kept, not just the most recent one.
                with open('error.txt', 'a') as error_file:
                    error_file.write(str(e) + str(page.get_url()) + "\n")

            if file_count >= batch_size:
                write_to_index(inverted_index, file_count_name_count, file_count_name)
                inverted_index = defaultdict(list)
                # Advance to the next file number and restart the counter in
                # the same step as the flush. Leaving the counter at the
                # threshold made a following failed read (or the final write
                # below) call write_to_index again with an empty index under
                # the file number that had just been written.
                file_count_name_count += 1
                file_count = 0

    # Write whatever is left over as the last partial index. An empty
    # remainder is skipped so no empty trailing file is produced; file 1 is
    # still always written, as the original code did even when no document
    # was read.
    if inverted_index or file_count_name_count == 1:
        write_to_index(inverted_index, file_count_name_count, file_count_name)
    write_doc_ids(DOC_ID_DICT)