def endElement(self, tag):
    if tag == "id" and self.idFlag == 1:
        self.idFlag = 0
    elif tag == "title":
        config.id_title_map[config.page_count] = self.bufferTitle
        print("pg", config.page_count)
        process_title(self.bufferTitle)
    elif tag == "text":
        process_text(self.bufferText)
        create_index()
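# A minimal sketch (an assumption, not the original source) of the SAX plumbing
# around the endElement handler above: a xml.sax.ContentHandler subclass whose
# characters() callback fills bufferTitle/bufferText while startElement tracks
# the currently open tag. The WikiHandler name and the dump filename are
# hypothetical.
import xml.sax

class WikiHandler(xml.sax.ContentHandler):
    def __init__(self):
        super().__init__()
        self.idFlag = 0
        self.currentTag = ""
        self.bufferTitle = ""
        self.bufferText = ""

    def startElement(self, tag, attrs):
        self.currentTag = tag
        if tag == "id":
            self.idFlag = 1
        elif tag == "title":
            self.bufferTitle = ""
        elif tag == "text":
            self.bufferText = ""

    def characters(self, content):
        if self.currentTag == "title":
            self.bufferTitle += content
        elif self.currentTag == "text":
            self.bufferText += content

    # endElement as defined above completes the handler

# xml.sax.parse("dump.xml", WikiHandler())  # streams the file; never loads it whole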
def crawl_and_index(base_urls, con, depth=2):
    indexed_urls = set()
    for _ in range(depth):
        for base_url in base_urls:
            indexed_urls.add(base_url)
            links = get_valid_links(base_url)
            for link in links:
                if link not in indexed_urls:
                    create_index(link, con)
                    indexed_urls.add(link)
    return indexed_urls
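# A hedged sketch of the get_valid_links helper assumed by crawl_and_index.
# requests and BeautifulSoup are assumptions here; the original project may
# extract and filter links differently.
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def get_valid_links(base_url):
    links = set()
    try:
        html = requests.get(base_url, timeout=5).text
    except requests.RequestException:
        return links
    base_host = urlparse(base_url).netloc
    for a in BeautifulSoup(html, "html.parser").find_all("a", href=True):
        url = urljoin(base_url, a["href"])
        if urlparse(url).netloc == base_host:  # stay on the crawled host
            links.add(url.split("#")[0])       # drop in-page fragments
    return links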
import os
import pickle
from os import listdir
from os.path import isfile, join

def main():
    listFolder = os.listdir("DEV")  # list of sub-folder names in DEV/
    urlNum = dict()
    i = 0
    for folder in listFolder:
        # every file in this sub-folder
        files = [f for f in listdir("DEV/" + folder) if isfile(join("DEV/" + folder, f))]
        index = create_index(files, folder)  # one index per folder in DEV
        filename = "indexer" + str(i) + ".txt"
        with open(filename, 'wb') as fileObject:  # create a partial index per subfolder
            pickle.dump(index.getDict(), fileObject)  # dump the index dictionary into a file
        urlNum[folder] = 0  # number of URLs from each folder (just testing)
        i += 1
    pickle.dump(urlNum, open('docID_to_url.txt', 'wb'))
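# A hedged sketch of reading the partial indexes written above back in and
# merging them into one dictionary. Treating each posting value as a list is
# an assumption; the real layout of index.getDict() may differ.
import pickle

def merge_partial_indexes(paths):
    merged = {}
    for path in paths:
        with open(path, 'rb') as f:
            partial = pickle.load(f)
        for term, postings in partial.items():
            merged.setdefault(term, []).extend(postings)
    return merged

# merged = merge_partial_indexes(["indexer0.txt", "indexer1.txt"])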
import json
import math

import javalang
from tqdm import tqdm

import find_sources
import indexer
import parse
import repository

with open('config.json') as f:
    index_config = json.load(f)

if not indexer.elastic_search_is_running(index_config):
    print('Elastic search is not responding on '
          + index_config['host'] + ':' + index_config['port'])
else:
    indexer.delete_index(index_config)
    if not indexer.index_exists(index_config):
        print('Index "' + index_config['index'] + '" was not found')
        print('Creating index "' + index_config['index'] + '"')
        indexer.create_index(index_config)
    repos = find_sources.load_cache()[0:500]
    for data in tqdm(repos):
        repo = data["url"]
        score = data["score"]
        target_dir = repository.clone_repo(repo)
        for file in repository.java_files(target_dir):
            # print("Indexing file " + file)
            try:
                methods = parse.find_methods(open(file).read())
            except javalang.parser.JavaSyntaxError:
                # print("Skipping {}, couldn't parse it".format(file))
                continue
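# A minimal sketch (assumptions throughout) of the indexer helpers the script
# above calls, built on the elasticsearch-py client; only the function names
# come from the script, the bodies here are illustrative.
from elasticsearch import Elasticsearch

def _client(cfg):
    return Elasticsearch([{'host': cfg['host'], 'port': int(cfg['port'])}])

def elastic_search_is_running(cfg):
    return _client(cfg).ping()  # True if the cluster answers

def index_exists(cfg):
    return _client(cfg).indices.exists(index=cfg['index'])

def create_index(cfg):
    _client(cfg).indices.create(index=cfg['index'], ignore=400)  # 400: already exists

def delete_index(cfg):
    _client(cfg).indices.delete(index=cfg['index'], ignore=[400, 404])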
import json
import logging

from indexer import connect_elasticsearch, create_index, store_record, search

property_index = 'wd_property'
entity_index = 'wd_entity'
property_doc_type = 'members'

with open('data/dump/property.json') as f:
    prop_datas = json.load(f)

logging.basicConfig(level=logging.ERROR)
es = connect_elasticsearch()

if create_index(es, property_index) and create_index(es, entity_index):
    count = 0
    for elem in prop_datas:
        prop_data = json.dumps(elem)
        if not store_record(es, property_index, prop_data):
            break
        count += 1
        if count % 100 == 0:
            print("records stored so far: {}".format(count))
    print("finish")
    print("total records stored: {}".format(count))
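# A hedged sketch of the store_record helper imported above, using the standard
# elasticsearch-py index() call; the real implementation may assign doc types
# or ids differently.
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ElasticsearchException

def store_record(es, index_name, record):
    try:
        es.index(index=index_name, body=record)  # record: a JSON string or dict
        return True
    except ElasticsearchException:
        return False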