def _processing_job(documents, parameter_dict):
    """A single processing job on a parallel node, which processes a batch of documents.

    :param documents: batch of documents to process.
    :param parameter_dict: dataset import's parameters.
    :type documents: list of dicts
    :type parameter_dict: dict
    """
    dataset_name = '{0}_{1}'.format(parameter_dict['texta_elastic_index'],
                                    parameter_dict['texta_elastic_mapping'])

    try:
        result_map = DocumentPreprocessor.process(documents=documents, **parameter_dict)
        documents = result_map['documents']

        storer = DocumentStorer.get_storer(**parameter_dict)
        stored_documents_count = storer.store(documents)

        if documents:
            # The progress counter is shared across workers, so guard it with the lock.
            with lock:
                dataset_import = DatasetImport.objects.get(pk=parameter_dict['import_id'])
                dataset_import.processed_documents += stored_documents_count
                dataset_import.save()
    except Exception as e:
        HandleDatasetImportException(parameter_dict, e)

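# --- Sketch (not from the source): one way _processing_job could be dispatched. ---
# Assumes fork-based multiprocessing, where a module-level Lock is inherited by the
# workers (a Lock cannot be passed through Pool arguments). The batching helper and
# worker count are illustrative, not the project's actual scheduler.
import multiprocessing

lock = multiprocessing.Lock()  # guards the DatasetImport progress counter above

def _run_import_jobs(document_batches, parameter_dict, workers=4):
    with multiprocessing.Pool(processes=workers) as pool:
        pool.starmap(_processing_job,
                     [(batch, parameter_dict) for batch in document_batches])
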
def get_features(**kwargs):
    try:
        host = kwargs.get('postgres_host', None)
        database = kwargs.get('postgres_database', None)
        port = kwargs.get('postgres_port', None)
        user = kwargs.get('postgres_user', None)
        password = kwargs.get('postgres_password', None)
        table_name = kwargs.get('postgres_table', None)

        connection_string = PostgreSQLReader.get_connection_string(host, database, port, user, password)

        with psycopg2.connect(connection_string) as connection:
            cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
            # NB: the table name is spliced into the query string directly, so it must
            # come from a trusted source (see the identifier-quoting sketch below).
            cursor.execute("SELECT * FROM %s;" % table_name)

            value = cursor.fetchone()
            while value:
                yield value
                value = cursor.fetchone()
    except Exception as e:
        HandleDatasetImportException(kwargs, e, file_path='')

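# --- Sketch (not from the source): quoting the table name as an identifier. ---
# psycopg2's sql module can build the query safely instead of %-formatting it;
# this assumes the same `cursor` and `table_name` as in the reader above.
from psycopg2 import sql

query = sql.SQL("SELECT * FROM {};").format(sql.Identifier(table_name))
cursor.execute(query)
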
def get_features(**kwargs):
    directory = kwargs['directory']

    for file_path in CSVReader.get_file_list(directory, 'csv'):
        try:
            with open(file_path) as csv_file:
                reader = csv.DictReader(csv_file)

                for row_idx, row in enumerate(reader):
                    row['_texta_id'] = '{0}_{1}'.format(file_path, row_idx)
                    yield row
        except Exception as e:
            HandleDatasetImportException(kwargs, e, file_path=file_path)

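# --- Sketch (not from the source): consuming a reader generator. ---
# All readers share the same contract: pass reader-specific kwargs and iterate
# over plain dicts tagged with '_texta_id'. The directory path is illustrative,
# and this assumes get_features is exposed on the reader class as shown above.
for document in CSVReader.get_features(directory='/data/import/csv'):
    print(document['_texta_id'])
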
def get_features(**kwargs):
    directory = kwargs['directory']

    for file_path in PDFReader.get_file_list(directory, 'pdf'):
        features = PDFReader.get_meta_features(file_path=file_path)

        try:
            features['text'] = textract.process(file_path).decode('utf8')
            features['_texta_id'] = file_path
            yield features
        except Exception as e:
            HandleDatasetImportException(kwargs, e, file_path=file_path)

def get_features(**kwargs):
    directory = kwargs['directory']

    for file_extension in ['xls', 'xlsx']:
        for file_path in ExcelReader.get_file_list(directory, file_extension):
            try:
                book = xlrd.open_workbook(file_path)
                sheet = book.sheet_by_index(0)

                # First row holds the column labels; coerce non-string cells to str.
                feature_labels = [cell.value if isinstance(cell.value, str) else str(cell.value)
                                  for cell in sheet.row(0)]

                # Derive one converter per column from the non-empty cell types below the header.
                feature_converters = []
                for column_idx in range(sheet.ncols):
                    col_types = list({
                        col_type for col_type in sheet.col_types(column_idx)[1:]  # TODO [1:] will go out of range
                        if col_type not in ExcelReader.empty_and_blank_codes
                    })
                    col_values = sheet.col_values(column_idx)
                    feature_converters.append(ExcelReader.get_column_converter(col_types, col_values))

                # Yield one document per data row, skipping the header row.
                for row_idx, excel_row in ((row_idx, sheet.row(row_idx)) for row_idx in range(1, sheet.nrows)):
                    document = {
                        feature_labels[col_idx]: feature_converters[col_idx](cell.value)
                        for col_idx, cell in enumerate(excel_row)
                    }
                    document['_texta_id'] = '{0}_{1}'.format(file_path, row_idx)
                    yield document
            except Exception as e:
                HandleDatasetImportException(kwargs, e, file_path=file_path)

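# --- Hypothetical sketch (not the repo's implementation): a column converter. ---
# ExcelReader.get_column_converter is defined elsewhere in the repo and its body is
# not shown here; this illustrative version picks a cast from the xlrd type codes
# observed in a column, e.g. stripping the trailing '.0' from integral floats.
def get_column_converter(col_types, col_values):
    if col_types == [xlrd.XL_CELL_NUMBER]:
        return lambda value: str(int(value)) if float(value).is_integer() else str(value)
    return lambda value: value if isinstance(value, str) else str(value)
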
def get_features(**kwargs):
    directory = kwargs['directory']

    for file_path in TXTReader.get_file_list(directory, 'txt'):
        try:
            features = TXTReader.get_meta_features(file_path=file_path)

            with open(file_path, 'r', encoding='utf8') as text_file:
                features['text'] = text_file.read()

            features['_texta_id'] = file_path
            yield features
        except Exception as e:
            HandleDatasetImportException(kwargs, e, file_path=file_path)

def get_features(**kwargs):
    directory = kwargs['directory']
    file_list = (CollectionReader.get_file_list(directory, 'jsonl') +
                 CollectionReader.get_file_list(directory, 'jl'))

    for file_path in file_list:
        with open(file_path, 'r', encoding='utf8') as json_file:
            for line in json_file:
                try:
                    features = json.loads(line.strip())
                    yield features
                except Exception as e:
                    HandleDatasetImportException(kwargs, e, file_path=file_path)

def get_features(**kwargs):
    select_query = kwargs['sqlite_select_query']
    directory = kwargs['directory']

    for file_path in SQLiteReader.get_file_list(root_directory=directory):
        try:
            with sqlite3.connect(file_path) as connection:
                connection.row_factory = dict_factory
                cursor = connection.cursor()
                cursor.execute(select_query)

                value = cursor.fetchone()
                while value:
                    yield value
                    value = cursor.fetchone()
        except Exception as e:
            HandleDatasetImportException(kwargs, e, file_path=file_path)

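# --- Sketch (not from the source): the conventional sqlite3 dict_factory. ---
# `dict_factory` above is defined elsewhere in the repo; the standard recipe from
# the sqlite3 docs maps each row to a dict keyed by column name, matching the
# dicts this reader yields.
def dict_factory(cursor, row):
    return {column[0]: value for column, value in zip(cursor.description, row)}
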
def get_features(**kwargs):
    # Stub reader: not implemented yet. The error is raised and immediately caught
    # so that it is reported through the same handler as the other readers.
    try:
        raise NotImplementedError()
    except Exception as e:
        HandleDatasetImportException(kwargs, e, file_path='')