def table_parser(self, file_name, root):
    """Extract the CSS classes of all tables in one zipped article HTML
    file and insert each distinct class into the database.

    file_name -- zip file name; the source article id is its second
                 '_'-separated token
    root      -- directory containing the zip file
    """
    database = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    build_view = database.get_build_view()
    db_cursor = build_view._cursor

    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG,
                        format=LOGGING_FORMAT, filemode='w')

    # unzip the article HTML and feed it to the table parser
    html_parser = WikipediaHTMLTableParser()
    archive_path = os.path.join(root, file_name)
    raw_html = self.zip2html(archive_path)
    html_parser.feed(raw_html.decode('utf-8'))

    source_article_id = file_name.split('_')[1]
    try:
        fed_parser = WikipediaFedTextParser(html_parser.get_data())
        # de-duplicate before inserting
        for table_class in set(fed_parser.table_classes(None)):
            self.insert_table_class(source_article_id, table_class, db_cursor)
    except KeyError:
        # parser hit an unexpected structure -- undo partial inserts
        build_view._db_connection.rollback()
        logging.error('KeyError FedTextParser source article id: %s ' % source_article_id)
    build_view.commit()
    build_view.reset_cache()
def __init__(self, path): #os.environ["DISPLAY"]=":1" print path os.environ["DISPLAY"]=":1" db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME) self.db_build_view = db.get_build_view() self.cursor = self.db_build_view._cursor self.app = QApplication(sys.argv) self.path = path
def __init__(self, path): #os.environ["DISPLAY"]=":1" print path os.environ["DISPLAY"] = ":1" db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME) self.db_build_view = db.get_build_view() self.cursor = self.db_build_view._cursor self.app = QApplication(sys.argv) self.path = path
def _extract_articles(self): INPUT_FILE = WIKI_DUMP_XML_FILE #self.read_path('Please enter the path of the wiki dump file [.xml]') #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"#self.read_path('Please enter the path of the wiki dump file [.xml]') MAX_ARTICLES_IN_QUEUE = 200 #self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000) NUM_THREADS = 1 #self.read_number('How many threads shall be used to write to the database?', 20, 1, 50) CONTINUE = True #self.read_yes_no('This process might take several days to finish.\nDo you want to continue?') if CONTINUE: # measure time start = time.clock() # connect to database and create article queue db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME) queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE) # create reader and threads reader = WikipediaReader(INPUT_FILE, queue, extract_text=False) threads = [] for i in range(0, NUM_THREADS): inserter = ArticleInserter(queue, db.get_build_view()) threads.append(inserter) # start reader reader.start() # start insert threads for thread in threads: thread.start() # wait for reading thread, queue and inserters to be done reader.join() queue.join() for thread in threads: thread.end() for thread in threads: thread.join() seconds = round(time.clock() - start) print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60) else: print 'Aborting...'
def _extract_articles(self): INPUT_FILE = WIKI_DUMP_XML_FILE #self.read_path('Please enter the path of the wiki dump file [.xml]') #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"#self.read_path('Please enter the path of the wiki dump file [.xml]') MAX_ARTICLES_IN_QUEUE = 200#self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000) NUM_THREADS = 1#self.read_number('How many threads shall be used to write to the database?', 20, 1, 50) CONTINUE = True#self.read_yes_no('This process might take several days to finish.\nDo you want to continue?') if CONTINUE: # measure time start = time.clock() # connect to database and create article queue db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME) queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE) # create reader and threads reader = WikipediaReader(INPUT_FILE, queue, extract_text=False) threads = [] for i in range(0, NUM_THREADS): inserter = ArticleInserter(queue, db.get_build_view()) threads.append(inserter) # start reader reader.start() # start insert threads for thread in threads: thread.start() # wait for reading thread, queue and inserters to be done reader.join() queue.join() for thread in threads: thread.end() for thread in threads: thread.join() seconds = round (time.clock() - start) print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60) else: print 'Aborting...'