예제 #1
0
 def update_index(self, ref_gen, testing=True):
     """Add the elements in ref_gen to an existing index.

     Parameters
     ----------
     ref_gen: iterable of pandas.DataFrame
         Chunks of the referential to insert (presumably produced by
         pd.read_csv with a chunksize, as in create_index — TODO confirm).
     testing: bool
         Forwarded to es_insert.index; defaults to True, which was the
         previously hard-coded value.
     """
     logging.warning('Updating index')
     # action="update" updates existing documents rather than re-creating them.
     es_insert.index(es, ref_gen, self.index_name, testing, action="update")
     logging.warning('Finished updating')
    def create_index(self, ref_path, columns_to_index, force=False,
                     testing=True):
        '''Index the csv file at ref_path in Elasticsearch.

        The index is created only when no index exists yet; an existing
        index is first deleted when force is True or when it is no longer
        valid (per self.valid_index()).

        Parameters
        ----------
        ref_path: str
            Path to the csv file to index.
        columns_to_index: dict
            Maps column names to their indexing configuration; only the
            keys are used here to select which columns to read.
        force: bool
            If True, delete any pre-existing index and re-index from scratch.
        testing: bool
            Forwarded to es_insert.index; defaults to True, which was the
            previously hard-coded value.
        '''

        # To solve http.client.HTTPException: got more than 100 headers
        import http
        http.client._MAXHEADERS = 1000

        # Stream the file in chunks to bound memory usage.
        ref_gen = pd.read_csv(ref_path,
                              usecols=columns_to_index.keys(),
                              dtype=str,
                              chunksize=self.es_insert_chunksize)

        # Drop the current index when forced or when it is invalid.
        if self.has_index() and (force or (not self.valid_index())):
            self.ic.delete(self.index_name)

        if not self.has_index():
            logging.info('Creating new index')
            log = self._init_active_log('INIT', 'transform')

            index_settings = es_insert.gen_index_settings(
                DEFAULT_ANALYZER, columns_to_index, INDEX_SETTINGS_TEMPLATE)

            logging.warning('Creating index')
            logging.warning(index_settings)

            self.ic.create(self.index_name, body=json.dumps(index_settings))
            logging.warning('Inserting in index')
            es_insert.index(es, ref_gen, self.index_name, testing)

            log = self._end_active_log(log, error=False)
            logging.warning('Finished indexing')
        else:
            logging.info('Index already exists')
        logging.info('Finished indexing')
        self.valid_index()  # Re-checks/records index validity.
        self._write_log_buffer(written=False)
testing = True

# (Re-)index only when forced, or when no index exists yet.
needs_indexing = force_re_index or not ic.exists(ref_table_name)
if needs_indexing:
    # Start from a clean slate if an index is already present.
    if ic.exists(ref_table_name):
        ic.delete(ref_table_name)

    # Stream the referential file in chunks to bound memory usage.
    chunk_iter = pd.read_csv(ref_file_path,
                             dtype=str,
                             usecols=columns_to_index.keys(),
                             chunksize=40000)

    index_settings = es_insert.gen_index_settings(columns_to_index)

    settings_body = json.dumps(index_settings)
    ic.create(ref_table_name, body=settings_body)
    es_insert.index(chunk_iter, ref_table_name, testing)

# =============================================================================
# Initiate the labellers
# =============================================================================

if test_num == 2:
    # Columns whose equality is treated as a certain match for auto-labelling.
    columns_certain_match = {'source': ['SIRET'], 'ref': ['SIRET']}

    labellers = {}
    for labeller_id in range(3):
        # Create each labeller and auto-label immediately, before building
        # the next one (same interleaving as before).
        new_labeller = ConsoleLabeller(es, source, ref_table_name, match_cols,
                                       columns_to_index)
        labellers[labeller_id] = new_labeller
        new_labeller.auto_label(columns_certain_match)

#    import cProfile
예제 #4
0
force_re_index = True  # Usually set to false

# Create the index (force_re_index drops any pre-existing index first).
es_insert.create_index(es, ref_table_name, columns_to_index,
                       default_analyzer=default_analyzer,
                       analyzer_definitions=ANALYZERS,
                       force=force_re_index)

# Insert documents in the index, streaming the referential file in chunks.
ref_gen = pd.read_csv(ref_file_path,
                      dtype=str,
                      usecols=columns_to_index.keys(),
                      chunksize=40000)
es_insert.index(es, ref_gen, ref_table_name, testing=True)

# =============================================================================
# 3. Initiate the labeller
# =============================================================================

# -----------------------------------------------------------------------------
# NB.1:
# Enter `h` or `help` in the labeller console to display help.
#
# NB.2:
# Advanced users may want to skip the labelling process and go directly to
# linking (step 6.) and enter custom parameters instead of learning them
#
# EX.1:
# For the provided example it might be useful to add the following filters (=f)
예제 #5
0
    def create_index(self,
                     ref_path,
                     columns_to_index,
                     force=False,
                     no_delete=False):
        '''Index a csv file in Elasticsearch.

        Unless force is set to True, this method checks whether an index
        already exists with a mapping that includes the one requested by
        columns_to_index. If not, it deletes the existing index and fully
        re-indexes the file.

        # TODO: look into re-indexing a single column

        Parameters
        ----------
        ref_path: str
            Path to the csv file to index.
        columns_to_index: dict like {col1: list_of_analyzers1, float_col: 'float' ...}
            The analyzers (or type if not string) to use for each column.
        force: bool
            Force deleting any existing index in all cases.
        no_delete: bool
            Prevent deleting if set to True unless force is also set to True.
        '''

        # To solve http.client.HTTPException: got more than 100 headers
        import http
        http.client._MAXHEADERS = 1000

        testing = True

        # Per-column dtypes for the columns we are going to index.
        dtype = {
            col: self._choose_dtype(col)
            for col in columns_to_index.keys()
        }
        # Stream the file in chunks to bound memory usage.
        ref_gen = pd.read_csv(ref_path,
                              usecols=columns_to_index.keys(),
                              dtype=dtype,
                              chunksize=self.es_insert_chunksize)

        # Forced deletion, or deletion of an invalid index.
        if self.has_index() and (force or (not self.valid_index())):
            print('[create_index] Deleting index')
            self.ic.delete(self.index_name)

        # Columns whose value is a list of analyzers (non-string values);
        # columns mapped to a plain type string ('float', ...) have no analyzers.
        analyzed_columns = {key: val for key, val in columns_to_index.items()
                            if not isinstance(val, str)}
        if self.has_index() and (not no_delete):
            # Delete the index when the current mapping lacks any of the
            # requested analyzers, so it will be fully re-created below.
            # NOTE(review): the mapping is looked up by self.project_id while
            # the index itself is self.index_name — confirm they coincide.
            # Was bare `ic` (undefined in this scope); use self.ic as the
            # other calls in this method do.
            mapping = self.ic.get_mapping(self.project_id)[
                self.project_id]['mappings']['structure']['properties']
            for col, analyzers in analyzed_columns.items():
                # Default to empty fields so a column absent from the mapping
                # counts as "all analyzers missing" instead of raising
                # AttributeError on None['fields'].
                col_fields = mapping.get(col, {'fields': {}})['fields'] or {}
                missing = [a for a in analyzers if col_fields.get(a) is None]
                if missing:
                    print('Mapping is: {0}\nCol: {1}\nAnalyzers:{2}'.format(
                        mapping, col, analyzers))
                    print('[create_index] Deleting index because of analyzers')
                    print('Missing:\n', [(col, a) for a in missing])
                    logging.warning(
                        '[create_index] Deleting index because of missing analyzers'
                    )
                    self.ic.delete(self.index_name)
                    break

        if not self.has_index():
            logging.info('Creating new index')
            log = self._init_active_log('INIT',
                                        'transform')  # TODO: is this right ?

            logging.warning('Creating index')
            es_insert.create_index(self.es,
                                   self.index_name,
                                   columns_to_index,
                                   default_analyzer=DEFAULT_ANALYZER,
                                   analyzer_definitions=ANALYZERS,
                                   force=force)

            logging.warning('Inserting in index')
            # Was bare `es` (undefined in this scope); use self.es as the
            # create_index call above does.
            es_insert.index(self.es,
                            ref_gen,
                            self.index_name,
                            testing,
                            action='index')

            log = self._end_active_log(log, error=False)
            logging.warning('Finished indexing')
            time.sleep(5)  # TODO: why is this necessary?
        else:
            logging.info('Index already exists')

        logging.info('Finished indexing')
        self.valid_index()  # Re-checks/records index validity.
        self._write_log_buffer(written=False)