def process_file(file_path_input, file_path_output, i):
    if os.path.exists(file_path_output):
        print('{} {} Already done'.format(now(), file_path_output))
        return
    input_file = open(file_path_input, 'r')
    # every worker writes a separate output file for its input file
    output_file = open(file_path_output, 'a')
    writer = csv.writer(output_file, delimiter='\t')
    # for every input file (100 URLs) we create a separate driver
    driver = webdriver.PhantomJS(executable_path=path_to_phantomjs)
    for line in input_file:
        splited = line.split('\t')
        url = splited[1]
        print('{} Process={} Current url: {}'.format(now(), i, url))
        # run feature extraction in a child process so a hanging page can be killed
        temp_queue = Queue()
        p = Process(target=get_element_features, args=(url, driver, temp_queue, i))
        print('{} Process={} Started: feature extraction {}'.format(now(), i, url))
        p.start()
        try:
            event_features = temp_queue.get(timeout=TIME_OUT_FEATURE)
        except Empty:
            event_features = None
            print('{} Process={} Timed out on: feature extraction {}'.format(now(), i, url))
        if p.is_alive():
            p.terminate()
        print("Event features:" + str(event_features))
        if event_features is not None:
            print('{} Process={} Got properties for {}'.format(now(), i, url))
            # write the extracted features to the per-file output in a separate process
            p_event_features = Process(target=write_element_features,
                                       args=(event_features, writer, output_file))
            start_with_timeout(p_event_features, TIME_OUT_LOAD, "feature writing", url, i)
            if p_event_features.is_alive():
                p_event_features.terminate()
    driver.service.process.send_signal(signal.SIGTERM)
    driver.quit()
    input_file.close()
    output_file.close()
    return 'done'
def worker(file_ids, i):
    for file_id in file_ids:
        file_path_input = get_filepath(file_id)
        file_path_output = get_filepath(file_id, input=False)
        try:
            print('{} Process={} Started to process file {}'.format(
                now(), i, file_path_input))
            process_file(file_path_input, file_path_output, i)
        except Exception as e:
            print('{} Process={} Bad file! {}'.format(now(), i, e))
            continue
def start_with_timeout(process, timeout, msg, url, i):
    # start the child process and give it at most `timeout` seconds to finish
    process.start()
    process.join(timeout)
    if process.is_alive():
        print('{} Process={} Timed out on: {} {}'.format(now(), i, msg, url))
        sys.stdout.flush()
        process.terminate()
def process_file(file_path_input, file_path_output, i):
    if os.path.exists(file_path_output):
        print('{} {} Already done'.format(now(), file_path_output))
        return
    input_file = open(file_path_input, 'r')
    # every worker writes a separate output file for its input file
    output_file = open(file_path_output, 'a')
    writer = csv.writer(output_file, delimiter='\t')
    # for every input file (100 URLs) we create a separate driver
    driver = webdriver.PhantomJS(executable_path=path_to_phantomjs)
    for line in input_file:
        splited = line.split('\t')
        property_type = splited[0]
        url = splited[1]
        print('{} Process={} Current url: {}'.format(now(), i, url))
        # get the microformat properties in a child process so a hanging page can be killed
        temp_queue = Queue()
        p = Process(target=get_microformat_properties_by_type,
                    args=(url, property_type, temp_queue, i))
        start_with_timeout(p, TIME_OUT_LOAD, "loading", url, i)
        event_properties = temp_queue.get() if not temp_queue.empty() else None
        if p.is_alive():
            p.terminate()
        if event_properties is not None:
            print('{} Process={} Got properties for {}'.format(now(), i, url))
            # extract features and write them to the per-file output in a separate process
            p_event_features = Process(target=get_event_features_and_write,
                                       args=(event_properties, driver, writer, i, output_file))
            start_with_timeout(p_event_features, TIME_OUT_FEATURE, "feature extraction", url, i)
            if p_event_features.is_alive():
                p_event_features.terminate()
    driver.quit()
    input_file.close()
    output_file.close()
    return 'done'
def process_url(driver, url, i):
    output_filename = "{}/{}_{}.csv".format(PATH_PARSED_FILES, 'all_elements', i)
    if os.path.exists(output_filename):
        print('{} File already exists {}'.format(now(), output_filename))
        return
    try:
        driver.get(url)
    except Exception:
        print('{} Problem with url: {}'.format(now(), url))
        return
    # give the page a moment to render before reading elements
    time.sleep(2)
    output_file = open(output_filename, 'a')
    writer = csv.writer(output_file, delimiter='\t')
    element_features = get_element_features(url, driver)
    write_element_features(element_features, writer, output_file)
    output_file.close()
def get_element_features(url, driver, temp_queue=None, i=None):
    # temp_queue/i are optional so this can also run as a multiprocessing target
    # (as in process_file above); when called directly it simply returns the list
    print('{} Getting all element features for {}'.format(now(), url))
    driver.get(url)
    # visible elements that contain their own (non-whitespace) text
    elements = driver.find_elements_by_xpath(
        "//*[not(contains(@style,'display:none')) and normalize-space(text())]"
    )
    element_features = []
    for element in elements:
        try:
            if element.tag_name in GOOD_TAGS and element.text != '':
                element_features.append(ElementFeature(element, url, driver))
        except Exception as e:
            print(e)
    if temp_queue is not None:
        temp_queue.put(element_features)
    return element_features
def write_element_features(event_features, writer, output_file):
    print("{} {}".format(now(), "Writing features"))
    for element_feature in event_features:
        row_1_part = [
            element_feature.url, 'not_event_element', element_feature.text_property,
            element_feature.xy_coords['x'], element_feature.xy_coords['y'],
            element_feature.block_size['height'], element_feature.block_size['width'],
            element_feature.tag, 'NaN', element_feature.num_siblings
        ]
        css_prop = element_feature.css_prop
        row_2_part = [css_prop.get(css_h, None) for css_h in css_header]
        row = row_1_part + row_2_part
        # decode any byte values; everything else is written via its string representation
        writer.writerow([s.decode("utf-8") if isinstance(s, bytes) else str(s) for s in row])
        output_file.flush()
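# Minimal driver sketch (not part of the original script): it assumes input files are
# numbered 0..NUM_FILES-1 and splits them across NUM_WORKERS processes, each running
# worker() from above. NUM_FILES and NUM_WORKERS are hypothetical placeholders; replace
# them with however the file ids are actually produced.
if __name__ == '__main__':
    from multiprocessing import Process  # may already be imported at the top of the module

    NUM_FILES = 1000   # assumed total number of input files
    NUM_WORKERS = 8    # assumed degree of parallelism

    file_ids = list(range(NUM_FILES))
    chunk_size = (len(file_ids) + NUM_WORKERS - 1) // NUM_WORKERS
    chunks = [file_ids[j:j + chunk_size] for j in range(0, len(file_ids), chunk_size)]

    workers = [Process(target=worker, args=(chunk, i)) for i, chunk in enumerate(chunks)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()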