示例#1
0
文件: importer.py 项目: cbentes/texta
def _processing_job(documents, parameter_dict):
    """Process and store one batch of documents on a parallel node.

    The batch is run through ``DocumentPreprocessor.process``, the documents
    it returns are handed to the storer chosen by
    ``DocumentStorer.get_storer`` and, when preprocessing returned any
    documents, the ``processed_documents`` counter of the ``DatasetImport``
    row identified by ``parameter_dict['import_id']`` is increased by the
    value returned from ``storer.store``.

    Any exception raised along the way is passed to
    ``HandleDatasetImportException`` together with ``parameter_dict``.

    :param documents: batch of documents to process.
    :param parameter_dict: dataset import's parameters; forwarded as keyword
        arguments to both the preprocessor and the storer factory.
    :type documents: list of dicts
    :type parameter_dict: dict
    """
    try:
        result_map = DocumentPreprocessor.process(documents=documents,
                                                  **parameter_dict)
        documents = result_map['documents']

        storer = DocumentStorer.get_storer(**parameter_dict)
        stored_documents_count = storer.store(documents)

        if documents:
            # The counter update is a read-modify-write on a stored row, so
            # it is performed while holding the shared ``lock``.
            with lock:
                dataset_import = DatasetImport.objects.get(
                    pk=parameter_dict['import_id'])
                dataset_import.processed_documents += stored_documents_count
                dataset_import.save()

    except Exception as e:
        HandleDatasetImportException(parameter_dict, e)
示例#2
0
    def get_features(**kwargs):
        """Yield every row of a PostgreSQL table as a ``DictCursor`` row.

        Connection settings are read from ``kwargs`` under the keys
        ``postgres_host``, ``postgres_database``, ``postgres_port``,
        ``postgres_user`` and ``postgres_password``; the table to read is
        ``postgres_table``. Missing keys default to ``None``.

        Any exception is passed to ``HandleDatasetImportException`` with an
        empty ``file_path``.
        """
        try:
            host = kwargs.get('postgres_host', None)
            database = kwargs.get('postgres_database', None)
            port = kwargs.get('postgres_port', None)
            user = kwargs.get('postgres_user', None)
            password = kwargs.get('postgres_password', None)
            table_name = kwargs.get('postgres_table', None)

            connection_string = PostgreSQLReader.get_connection_string(
                host, database, port, user, password)

            connection = psycopg2.connect(connection_string)
            try:
                # In psycopg2, ``with connection`` only commits or rolls back
                # the transaction; it does not close the connection. The
                # surrounding ``finally`` does that, also when the consumer
                # abandons the generator early.
                with connection:
                    with connection.cursor(
                            cursor_factory=psycopg2.extras.DictCursor
                    ) as cursor:
                        # NOTE(review): the table name is interpolated
                        # straight into the SQL text. Confirm it can never
                        # come from untrusted input, or quote it with
                        # ``psycopg2.sql.Identifier``.
                        cursor.execute("SELECT * FROM %s;" % table_name)

                        value = cursor.fetchone()
                        while value:
                            yield value
                            value = cursor.fetchone()
            finally:
                connection.close()

        except Exception as e:
            HandleDatasetImportException(kwargs, e, file_path='')
示例#3
0
    def get_features(**kwargs):
        """Yield one document per data row of every CSV file in a directory.

        Files are taken from ``CSVReader.get_file_list(directory, 'csv')``,
        where ``directory`` is ``kwargs['directory']``. Each row is the
        mapping produced by ``csv.DictReader`` with an added ``_texta_id``
        of the form ``<file_path>_<row index>``.

        An exception while reading a file is passed to
        ``HandleDatasetImportException`` with that file's path, after which
        the next file is read.
        """
        directory = kwargs['directory']

        for file_path in CSVReader.get_file_list(directory, 'csv'):
            try:
                # The csv module asks for files opened with newline='' so it
                # can do its own newline handling; without it, newlines
                # embedded in quoted fields are not interpreted correctly.
                with open(file_path, newline='') as csv_file:
                    reader = csv.DictReader(csv_file)
                    for row_idx, row in enumerate(reader):
                        row['_texta_id'] = '{0}_{1}'.format(file_path, row_idx)
                        yield row

            except Exception as e:
                HandleDatasetImportException(kwargs, e, file_path=file_path)
示例#4
0
    def get_features(**kwargs):
        """Yield one document per PDF file found in a directory.

        Files are taken from ``PDFReader.get_file_list(directory, 'pdf')``,
        where ``directory`` is ``kwargs['directory']``. Each document starts
        from ``PDFReader.get_meta_features`` and gets a ``text`` entry (the
        UTF-8 decoded output of ``textract.process``) and a ``_texta_id``
        entry holding the file path.

        An exception while handling a file is passed to
        ``HandleDatasetImportException`` with that file's path, after which
        the next file is processed.
        """
        directory = kwargs['directory']

        for file_path in PDFReader.get_file_list(directory, 'pdf'):
            try:
                # Collecting the meta features is part of the per-file work,
                # so a failure here is reported for this file instead of
                # ending the whole generator.
                features = PDFReader.get_meta_features(file_path=file_path)
                features['text'] = textract.process(file_path).decode('utf8')
                features['_texta_id'] = file_path

                yield features
            except Exception as e:
                HandleDatasetImportException(kwargs, e, file_path=file_path)
示例#5
0
    def get_features(**kwargs):
        """Yield one document per data row of the first sheet of Excel files.

        For each of the extensions ``xls`` and ``xlsx``, files are taken
        from ``ExcelReader.get_file_list`` for ``kwargs['directory']``. Row 0
        of the first sheet supplies the feature labels; every later row
        becomes a dict mapping label to converted cell value, plus a
        ``_texta_id`` of the form ``<file_path>_<row index>``.

        An exception while reading a file is passed to
        ``HandleDatasetImportException`` with that file's path, after which
        the next file is read.
        """
        directory = kwargs['directory']

        for file_extension in ['xls', 'xlsx']:
            for file_path in ExcelReader.get_file_list(directory,
                                                       file_extension):

                try:
                    book = xlrd.open_workbook(file_path)
                    sheet = book.sheet_by_index(0)

                    # Header cells that are not already strings are
                    # stringified so they can serve as feature labels.
                    feature_labels = [
                        cell.value
                        if isinstance(cell.value, str) else str(cell.value)
                        for cell in sheet.row(0)
                    ]
                    feature_converters = []

                    for column_idx in range(sheet.ncols):
                        # Distinct cell types found below the header row,
                        # ignoring empty/blank cells. The [1:] slice simply
                        # yields nothing for a header-only sheet.
                        col_types = list({
                            col_type
                            for col_type in sheet.col_types(column_idx)[1:]
                            if col_type not in
                            ExcelReader.empty_and_blank_codes
                        })
                        col_values = sheet.col_values(column_idx)
                        feature_converters.append(
                            ExcelReader.get_column_converter(
                                col_types, col_values))

                    for row_idx in range(1, sheet.nrows):
                        document = {
                            feature_labels[col_idx]:
                            feature_converters[col_idx](cell.value)
                            for col_idx, cell in enumerate(sheet.row(row_idx))
                        }
                        document['_texta_id'] = '{0}_{1}'.format(
                            file_path, row_idx)
                        yield document

                except Exception as e:
                    HandleDatasetImportException(kwargs,
                                                 e,
                                                 file_path=file_path)
示例#6
0
    def get_features(**kwargs):
        """Yield one document per text file found in a directory.

        Files are taken from ``TXTReader.get_file_list(directory, 'txt')``,
        where ``directory`` is ``kwargs['directory']``. Each document starts
        from ``TXTReader.get_meta_features`` and gets a ``text`` entry with
        the file's UTF-8 decoded content and a ``_texta_id`` entry holding
        the file path.

        An exception while handling a file is passed to
        ``HandleDatasetImportException`` with that file's path, after which
        the next file is processed.
        """
        root = kwargs['directory']
        txt_paths = TXTReader.get_file_list(root, 'txt')

        for path in txt_paths:
            try:
                document = TXTReader.get_meta_features(file_path=path)

                with open(path, 'r', encoding='utf8') as handle:
                    content = handle.read()

                document['text'] = content
                document['_texta_id'] = path
                yield document

            except Exception as error:
                HandleDatasetImportException(kwargs, error, file_path=path)
示例#7
0
    def get_features(**kwargs):
        """Yield one parsed JSON document per line of every JSON-lines file.

        Files with the extensions ``jsonl`` and ``jl`` are taken from
        ``CollectionReader.get_file_list`` for ``kwargs['directory']``. Each
        line is stripped and parsed with ``json.loads``.

        A file that cannot be opened, or a line that cannot be parsed, is
        passed to ``HandleDatasetImportException`` with the file's path;
        reading then continues with the next file or line respectively.
        """
        directory = kwargs['directory']
        file_list = CollectionReader.get_file_list(
            directory, 'jsonl') + CollectionReader.get_file_list(
                directory, 'jl')
        for file_path in file_list:
            # Open inside its own handler so one unreadable file is reported
            # and skipped instead of ending the whole generator.
            try:
                json_file = open(file_path, 'r', encoding='utf8')
            except OSError as e:
                HandleDatasetImportException(kwargs, e, file_path=file_path)
                continue

            with json_file:
                for line in json_file:
                    try:
                        features = json.loads(line.strip())
                        yield features

                    except Exception as e:
                        HandleDatasetImportException(kwargs,
                                                     e,
                                                     file_path=file_path)
示例#8
0
	def get_features(**kwargs):
		"""Yield every row returned by a select query from each SQLite file.

		The query is ``kwargs['sqlite_select_query']``; database files are
		taken from ``SQLiteReader.get_file_list`` for ``kwargs['directory']``.
		Rows are built by ``dict_factory``, which is installed as the
		connection's row factory.

		An exception while reading a file is passed to
		``HandleDatasetImportException`` with that file's path, after which
		the next file is read.
		"""
		select_query = kwargs['sqlite_select_query']
		directory = kwargs['directory']

		for file_path in SQLiteReader.get_file_list(root_directory=directory):
			try:
				connection = sqlite3.connect(file_path)
				try:
					# ``with connection`` only commits or rolls back; it does
					# not close the connection, so the ``finally`` below
					# releases it explicitly for every database file.
					with connection:
						connection.row_factory = dict_factory
						cursor = connection.cursor()
						cursor.execute(select_query)

						value = cursor.fetchone()
						while value:
							yield value
							value = cursor.fetchone()
				finally:
					connection.close()

			except Exception as e:
				HandleDatasetImportException(kwargs, e, file_path=file_path)
示例#9
0
 def get_features(**kwargs):
     """Report that feature extraction is not implemented.

     A ``NotImplementedError`` is raised and immediately handed to
     ``HandleDatasetImportException`` together with ``kwargs`` and an empty
     ``file_path``. Nothing is yielded or explicitly returned.
     """
     try:
         # Raised inside the ``try`` so the handler is given an exception
         # that was actually raised.
         raise NotImplementedError
     except Exception as error:
         HandleDatasetImportException(kwargs, error, file_path='')