# NOTE(review): the next two statements are the tail of a definition that
# begins above this chunk. They are kept unchanged; their indentation must
# match the enclosing body, which is not visible from here.
result_list.append(sentence)
index = bi.current()


if __name__ == '__main__':
    # Script entry point: read a task list of text-file names, split every
    # listed file into sentences and write one numbered result file per input.

    # Read the task list (one entry per target text file).
    path_to_targets = 'spider-targets/'
    target_file = path_to_targets + '_test_task.txt'
    list_targets = read_utf_txt_file(target_file)

    # Output location is loop-invariant, so it is set once up front.
    path_to_tmp_files = 'result_folder/'

    # Process each file separately. enumerate() supplies the output-file
    # counter, so it is always bound and starts at 0 for the first file.
    for index_file, txt_fname in enumerate(list_targets):
        file_content_in_list = read_utf_txt_file(txt_fname)
        meta = file_content_in_list[0]
        content = file_content_in_list[1:]

        # Write the header: the file's first entry followed by an empty string.
        result = [meta, '']

        # Tokenize the content.
        # NOTE(review): split_to_sentents presumably appends to `result` in
        # place; its return value is not used here. Confirm against its body.
        split_to_sentents(content, result)

        # Write the results to result_folder/tmp<N>.txt.
        write_result_file(
            result, path_to_tmp_files + 'tmp' + str(index_file) + '.txt')

    print('Done')
# App
import os

from to_text.tika_wrapper import TextExtractorFromOdtDocPdf
from to_text.tika_wrapper import write_result_file


if __name__ == '__main__':
    # Script entry point: run the text extractor over every entry of one
    # directory and write the resulting temp-file names as a task list.

    # Source directory; the trailing slash matters because entry names are
    # appended to it by plain string concatenation below.
    path_to_node1 = 'D:/home/lugansky-igor/doc_pdf_odt/'
    # Passed to extractor.process() as its second argument.
    tmp_file_root = 'tmp_folder'

    tasks_for_spider_purger = []
    # os.listdir yields the bare entry names of the directory, so no
    # java.io.File object is needed just to enumerate it.
    for fname in os.listdir(path_to_node1):
        full_name = path_to_node1 + fname
        extractor = TextExtractorFromOdtDocPdf()
        # NOTE(review): err_code and err_msg are never inspected, so a failed
        # extraction is not detected here; tmp_fname[0] is used regardless.
        # Confirm what process() returns on failure.
        tmp_fname, err_code, err_msg = extractor.process(
            full_name, tmp_file_root)
        tasks_for_spider_purger.append(tmp_fname[0])

    # Write the collected temp-file names as the task list.
    path_to_targets = 'spider-targets/'
    write_result_file(
        tasks_for_spider_purger, path_to_targets + '_test_task.txt')

    # Blank line, then the completion marker.
    print('')
    print('Done')