def start_crawl():
    global url_queue
    index.create_index()
    index.fetch_cached_urls()
    url_cnt = index.url_count()
    if url_cnt == 0:
        # Fresh crawl: enqueue the seed URLs, highest priority first.
        url_queue.put((4, {SEED_URL_1: {'level': 0}}))
        url_queue.put((3, {SEED_URL_2: {'level': 0}}))
        url_queue.put((2, {SEED_URL_3: {'level': 0}}))
        url_queue.put((1, {SEED_URL_4: {'level': 0}}))
    else:
        # Resume: re-seed the queue with 100 URLs sampled from the index.
        # randint is inclusive on both ends, so the upper bound is url_cnt - 1.
        for i in range(100):
            rand_index = randint(0, url_cnt - 1)
            url = index.get_url(rand_index)
            url_queue.put((-1, {url: {'level': 0}}))
    threads = []
    for x in range(4):
        t = threading.Thread(target=crawl_queue)
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
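# A minimal sketch of the crawl_queue worker the threads above run; the real
# implementation is not shown in this snippet. It assumes url_queue is a
# queue.PriorityQueue holding (priority, {url: {'level': depth}}) entries,
# as the puts in start_crawl() suggest.
import queue

def crawl_queue():
    while True:
        try:
            priority, entry = url_queue.get(timeout=10)
        except queue.Empty:
            return  # queue drained; let the worker thread exit
        for url, meta in entry.items():
            # fetch `url`, extract links, enqueue them at meta['level'] + 1
            pass
        url_queue.task_done()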
def create_table(name, attribute, PK):
    catalog.init_catalog()
    catalog.exist_table(name, True)
    # The primary key must be declared as a unique attribute
    # (a 5-field entry whose last field is 1).
    pidx = [x[0] for x in attribute].index(PK)
    if len(attribute[pidx]) != 5 or attribute[pidx][-1] != 1:
        raise Exception('Primary key is not a unique attribute!')
    catalog.create_table(name, attribute, PK)
    record.create_table(name)
    index.create_table(name, PK)
    # Build a 'Uni_' index for every other attribute declared unique.
    for x in attribute:
        if PK not in x and len(x) == 5 and x[-1] == 1:
            index.create_index(name, 'Uni_' + x[0], x[0])
    catalog.finalize()
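# Hypothetical call illustrating the attribute layout this create_table
# appears to expect: 5-field entries read as (name, type, length,
# allow_null, unique), where a trailing 1 marks the attribute unique --
# an assumption inferred from the len(x) == 5 and x[-1] == 1 checks above.
create_table('student',
             [('sno', 'char', 8, 0, 1),     # primary key, must be unique
              ('sname', 'char', 16, 0, 0),
              ('sage', 'int', 4, 0, 0)],
             'sno')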
def main():
    path_read_docs_index = "/home/jessica/Documents/data/IN_104-Projet-Informatique/clean_docs/"
    path_save_index_folder = "/home/jessica/Documents/data/IN_104-Projet-Informatique/index/"
    index_name = "index_IN104"
    num_docs_index = 50000
    number_docs_result_search = 5

    lst_index_docs = glob.glob(path_read_docs_index + '*.txt')
    pool = Pool(8)
    schema = Schema(path=TEXT(stored=True),
                    content=TEXT(analyzer=StemmingAnalyzer()))
    ix = create_index(path_save_index_folder, index_name, num_docs_index,
                      lst_index_docs, schema, pool)
    searcher = ix.searcher()
    parser_query = QueryParser("content", schema=schema, group=qparser.OrGroup)

    user_query = "types of cancer in the human body"
    # Wrap the raw query string in the project's Query object.
    query = Query(user_query)
    docs_result = query.get_query(parser_query, searcher, number_docs_result_search)
    for doc in docs_result:
        print(doc)
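# One plausible shape for the Query wrapper used above -- a sketch under the
# assumption that get_query parses the stored text and returns the top-n
# stored paths; the project's real class may differ.
class Query:
    def __init__(self, text):
        self.text = text

    def get_query(self, parser, searcher, limit):
        parsed = parser.parse(self.text)
        results = searcher.search(parsed, limit=limit)
        return [hit['path'] for hit in results]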
def create_index(new_idex, if_str_command):
    if catalog_manager.check_index(new_idex.table_name, new_idex.attribute_name) != 0:
        if if_str_command:
            return "Index already exists!\n"
        print("Index already exists!")
        return
    catalog_manager.create_index(new_idex)
    temp = record_manager.select_record_with_Index(new_idex.table_name, 0, [])
    cnt = catalog_manager.get_attribute_cnt(new_idex.table_name, new_idex.attribute_name)
    # Backfill the new index with the indexed column's value
    # from every existing record.
    values = []
    for i in temp:
        values.append(i[cnt])
    index.create_index(new_idex.index_id, values)
def create_index(tname, iname, iattr):
    # record.init()
    # catalog.init_catalog()
    # index.init_index()
    catalog.exist_index(iname, True)
    catalog.create_index(tname, iname, iattr)
    res = record.create_index(tname,
                              catalog.get_index_of_attribute(tname, iattr),
                              catalog.get_type_of_attribute(tname, iattr),
                              catalog.get_length(tname))
    try:
        index.create_index(tname, iname, res)
    except Exception:
        raise Exception(
            'Entries share the same key on the column the index is being created on!')
    # index.finalize_index()
def delete_all(table_name, if_str_command):
    # Recreate each of the table's indexes as empty, then clear the records.
    ind = catalog_manager.get_index(table_name)
    for i in ind:
        index.drop_index(i.index_id)
        index.create_index(i.index_id, [])
    record_manager.clear_table(table_name)
def create_table(table_dict, if_str_command):
    prim_index = table_dict['pri_index']
    # Start the primary-key index off empty; records are added later.
    index.create_index(prim_index.index_id, [])
    table = table_dict['new_table']
    catalog_manager.create_table(table, prim_index)
    record_manager.create_table(table_dict['table_name'])
import json

from whoosh.index import open_dir

from index import create_index, INDEX_NAME
from schema import to_fields, SCHEMA

create_index(INDEX_NAME, SCHEMA)

filename = './data/relevant_Tables_working.json'
index = open_dir(INDEX_NAME)
writer = index.writer()

with open(filename, 'r') as file:
    data = json.load(file)
    for identifier in data:
        print(identifier)
        table = data[identifier]
        fields = to_fields(identifier, table)
        writer.add_document(**fields)

writer.commit()
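# A hedged follow-up: querying the index the script just built. This assumes
# SCHEMA defines a searchable "content" field; substitute whatever field
# names schema.SCHEMA actually declares.
from whoosh.qparser import QueryParser

with index.searcher() as searcher:
    query = QueryParser("content", index.schema).parse("cancer")
    for hit in searcher.search(query, limit=5):
        print(hit)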
"first_name": "Arvind", "last_name": "ds", "gender": "M", "age": "16", "type": "person" } person_node_2 = { "id": "Afdg", "first_name": "Ara", "last_name": "ds", "gender": "M", "age": "16", "type": "person" } person_edges = {"source": "afds", "destination": "afdg", "type": "friend"} ''' start_time = time() index.create_index("node_index10", "test_type2", node_properties_2, zeusdb_temp.NODE_TABLE) end_time = time() print "Create Node Index total time : " + str(end_time - start_time) ''' #start_time = time() #index.create_index("edge_index17", "test_type2", edge_properties_2, zeusdb_temp.EDGE_TABLE) #end_time = time() #print "Create Edge Index total time : " + str(end_time - start_time) #zeusdb_temp.create_node(person_node, True) #zeusdb_temp.create_node(person_node_1, True) #zeusdb_temp.create_edge(props, True) #start_time = time()
def create_index(tname, iname, iattr):
    catalog.init_catalog()
    catalog.exist_index(tname, iname, True)
    index.create_index(tname, iname, iattr)
    catalog.create_index(tname, iname, iattr)
    catalog.finalize()
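# Hypothetical usage, assuming a 'student' table whose 'sno' column is
# already registered in the catalog; the names here are placeholders.
create_index('student', 'idx_student_sno', 'sno')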
# if answer in ['Y', 'y', 'YES', 'yes']:
#     es_setup.delete_cluster()
#     es_setup.clear_cache()

#######################################
# Create Index
#######################################
es = Elasticsearch(ES_URL)
res = index.get_index(es)
if res is False:
    # Index is missing: pull the XML rows out of Oracle and index them.
    db = db_setup.oracle_connection()
    db_setup.xml_cursor(db, es)
    index.create_index(es)

#######################################
# Kibana Configuration
#######################################
kibana_setup.clear_cache()
kibana_setup.connect_kibana()
kibana_setup.map_config()
kibana_setup.map_index_pattern()
kibana_setup.set_default_index_pattern()
kibana_setup.set_config()

#######################################
# Build Dashboards
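# A minimal sketch of what the index.create_index(es) helper might do,
# assuming the official elasticsearch-py client. The index name and mapping
# below are placeholders, not the project's real ones.
def create_index(es):
    es.indices.create(
        index='xml_documents',  # hypothetical index name
        body={
            'settings': {'number_of_shards': 1},
            'mappings': {
                'properties': {
                    'content': {'type': 'text'},  # placeholder field
                },
            },
        },
    )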
import shutil

from preprocess import main as preprocess
from index import main as create_index
from tfidf import main as tfidf
from classifier import main as create_classifier
from server import main as start_server

if __name__ == '__main__':
    shutil.rmtree('resources/dataset/rumoureval-data/random-rumours', True)
    print('running preprocess')
    preprocess()
    print('creating index')
    create_index()
    print('calculating tfidfs')
    tfidf()
    print('starting server')
    start_server()