Example #1
    def __init__(self, config=None, solrClient=None):
        self._logger = logging.getLogger(__name__)

        if self.linguistic_processor is None:
            self.linguistic_processor = LinguisticPreprocessor()

        if config is None:
            import configparser
            config = configparser.ConfigParser()
            config.read(
                os.path.join(os.path.dirname(__file__), '..', 'config',
                             'config'))

        try:
            self.pos_sequences_file = config['DEFAULT']['pos_sequence_filter']
        except KeyError:
            self._logger.exception(
                "Oops! 'pos_sequence_filter' is not found in config file.")
            raise Exception(
                "Please check 'pos_sequence_filter' is properly configured!")
        try:
            self.solr_core_url = config['DEFAULT']['solr_core_url']
        except KeyError:
            errMsg = "Target index url 'solr_core_url' is not configured in config file. Use default index directory instead."
            self._logger.exception(errMsg)
            raise Exception(errMsg)

        try:
            self._max_tokens = int(config['DEFAULT']['max_tokens'])
        except KeyError:
            errMsg = "'max_tokens' is not configured in config file. Default as 6 instead."
            self._logger.warning(errMsg)
            self._max_tokens = 6

        try:
            self._min_tokens = int(config['DEFAULT']['min_tokens'])
        except KeyError:
            errMsg = "'min_tokens' is not configured in config file. Default as 1 instead."
            self._logger.warning(errMsg)
            self._min_tokens = 1

        try:
            self._min_char_length = int(config['DEFAULT']['min_char_length'])
        except KeyError:
            errMsg = "'min_char_length' is not configured in config file. Default as 2 instead."
            self._logger.warning(errMsg)
            self._min_char_length = 2

        try:
            self._min_term_freq = int(config['DEFAULT']['min_term_freq'])
        except KeyError:
            errMsg = "'min_term_freq' is not configured in config file. Default is 1 instead."
            self._logger.warning(errMsg)
            self._min_term_freq = 1

        try:
            self.solr_field_content = config['DEFAULT']['solr_field_content']
        except KeyError:
            errMsg = "'solr_field_content' is not configured in config file. Default field name is 'content'"
            self._logger.warning(errMsg)
            self.solr_field_content = "content"

        if len(self.stopword_list) == 0:
            from nltk.corpus import stopwords
            self.stopword_list = set()
            #The union operator is much faster than add
            self.stopword_list |= set(stopwords.words('english'))
            try:
                customised_stopword_file = config['DEFAULT']['stopwords']
            except KeyError:
                errMsg = "Oops! customisable stopword file is not found in config file. Use default english stopword list instead!"
                self._logger.error(errMsg)
                customised_stopword_file = None

            smart_stopword_list = os.path.join(os.path.dirname(__file__), '..',
                                               'config', 'smart-stop-list.txt')
            if (customised_stopword_file is not None):
                self.stopword_list |= set(
                    read_by_line(customised_stopword_file))
                self.stopword_list |= set(read_by_line(smart_stopword_list))

            self._logger.debug("final stopword size: [%s]",
                               len(self.stopword_list))

            #dict_term will be loaded for dictionary matching
            #if len(self.dict_terms) == 0:
            #    self.dict_terms=set()

        if solrClient is None:
            from SolrClient import SolrClient
            self.solrClient = SolrClient(self.solr_core_url)
        else:
            self.solrClient = solrClient

        self.load_dictionary_tagging_setting(config)

        try:
            self.parallel_workers = config['DEFAULT']['PARALLEL_WORKERS']
        except KeyError:
            self._logger.exception(
                "Oops! 'PARALLEL_WORKERS' is not found in config file. Running with 1 worker instead."
            )
            #raise Exception("Please check 'PARALLEL_WORKERS' is properly configured!")
            self.parallel_workers = 1
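
For reference, below is a minimal sketch of the configuration this constructor reads, built with configparser so it can be passed straight in via the config parameter instead of relying on the default ../config/config file. The key names follow the lookups above; every value and path is an illustrative assumption.

import configparser

# Minimal configuration sketch for the constructor above; key names match the
# config lookups in __init__, values and paths are illustrative assumptions only.
config = configparser.ConfigParser()
config['DEFAULT'] = {
    'pos_sequence_filter': 'config/pos_sequence_filter.txt',  # assumed path
    'solr_core_url': 'http://localhost:8983/solr/tatasteel',  # assumed core URL
    'max_tokens': '6',
    'min_tokens': '1',
    'min_char_length': '2',
    'min_term_freq': '1',
    'solr_field_content': 'content',
    'stopwords': 'config/stopwords.txt',                      # assumed path
    'PARALLEL_WORKERS': '1',
}
# Optional section read by load_dictionary_tagging_setting(); leaving it out simply
# disables dictionary tagging through the KeyError handler.
config['DICTIONARY_TAGGER'] = {'dict_tagging': 'false'}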
Example #2
import logging
import os
import re

import nltk
from nltk.tokenize import sent_tokenize
# MultiprocPool is assumed here to be multiprocessing.Pool; the original module may
# import or alias it differently.
from multiprocessing import Pool as MultiprocPool

# Project-local helpers used below (LinguisticPreprocessor, TrieNode, read_by_line,
# load_terms_from_csv, term_async_comparison) are assumed to be importable from the
# surrounding package; their module paths are not shown in this snippet.


class TaggingProcessor(object):
    pos_sequences_file = ""
    linguistic_processor = None
    stopword_list = set()
    _logger = None
    dict_terms = set()
    steel_term_dict_terms = set()

    def __init__(self, config=None, solrClient=None):
        self._logger = logging.getLogger(__name__)

        if self.linguistic_processor is None:
            self.linguistic_processor = LinguisticPreprocessor()

        if config is None:
            import configparser
            config = configparser.ConfigParser()
            config.read(
                os.path.join(os.path.dirname(__file__), '..', 'config',
                             'config'))

        try:
            self.pos_sequences_file = config['DEFAULT']['pos_sequence_filter']
        except KeyError:
            self._logger.exception(
                "Oops! 'pos_sequence_filter' is not found in config file.")
            raise Exception(
                "Please check 'pos_sequence_filter' is properly configured!")
        try:
            self.solr_core_url = config['DEFAULT']['solr_core_url']
        except KeyError:
            errMsg = "Target index url 'solr_core_url' is not configured in config file. Use default index directory instead."
            self._logger.exception(errMsg)
            raise Exception(errMsg)

        try:
            self._max_tokens = int(config['DEFAULT']['max_tokens'])
        except KeyError:
            errMsg = "'max_tokens' is not configured in config file. Default as 6 instead."
            self._logger.warning(errMsg)
            self._max_tokens = 6

        try:
            self._min_tokens = int(config['DEFAULT']['min_tokens'])
        except KeyError:
            errMsg = "'min_tokens' is not configured in config file. Default as 1 instead."
            self._logger.warning(errMsg)
            self._min_tokens = 1

        try:
            self._min_char_length = int(config['DEFAULT']['min_char_length'])
        except KeyError:
            errMsg = "'min_char_length' is not configured in config file. Default as 2 instead."
            self._logger.warning(errMsg)
            self._min_char_length = 2

        try:
            self._min_term_freq = int(config['DEFAULT']['min_term_freq'])
        except KeyError:
            errMsg = "'min_term_freq' is not configured in config file. Default is 1 instead."
            self._logger.warning(errMsg)
            self._min_term_freq = 1

        try:
            self.solr_field_content = config['DEFAULT']['solr_field_content']
        except KeyError:
            errMsg = "'solr_field_content' is not configured in config file. Default field name is 'content'"
            self._logger.warning(errMsg)
            self.solr_field_content = "content"

        if len(self.stopword_list) == 0:
            from nltk.corpus import stopwords
            self.stopword_list = set()
            #The union operator is much faster than add
            self.stopword_list |= set(stopwords.words('english'))
            try:
                customised_stopword_file = config['DEFAULT']['stopwords']
            except KeyError:
                errMsg = "Oops! customisable stopword file is not found in config file. Use default english stopword list instead!"
                self._logger.error(errMsg)
                customised_stopword_file = None

            smart_stopword_list = os.path.join(os.path.dirname(__file__), '..',
                                               'config', 'smart-stop-list.txt')
            if (customised_stopword_file is not None):
                self.stopword_list |= set(
                    read_by_line(customised_stopword_file))
                self.stopword_list |= set(read_by_line(smart_stopword_list))

            self._logger.debug("final stopword size: [%s]",
                               len(self.stopword_list))

            #dict_term will be loaded for dictionary matching
            #if len(self.dict_terms) == 0:
            #    self.dict_terms=set()

        if solrClient is None:
            from SolrClient import SolrClient
            self.solrClient = SolrClient(self.solr_core_url)
        else:
            self.solrClient = solrClient

        self.load_dictionary_tagging_setting(config)

        try:
            self.parallel_workers = config['DEFAULT']['PARALLEL_WORKERS']
        except KeyError:
            self._logger.exception(
                "Oops! 'PARALLEL_WORKERS' is not found in config file. Running with 1 worker instead."
            )
            #raise Exception("Please check 'PARALLEL_WORKERS' is properly configured!")
            self.parallel_workers = 1

    def load_dictionary_tagging_setting(self, config):
        try:
            self.dict_tagging = config['DICTIONARY_TAGGER']['dict_tagging']
            if "true" == self.dict_tagging.lower():
                self.dict_tagging = True
            elif "false" == self.dict_tagging.lower():
                self.dict_tagging = False
            else:
                raise Exception(
                    "current setting [%s] for 'dict_tagging' is not supported!"
                    % self.dict_tagging)
        except KeyError:
            self._logger.exception(
                "Oops! 'dict_tagging' is not set in config file. Defaulting to False."
            )
            self.dict_tagging = False

        if not self.dict_tagging:
            self._logger.info(
                "dictionary tagging is set to false. Disable dictionary tagging."
            )
            return

        self._logger.info("Dictionary tagging is enabled.")

        try:
            self.dictionary_file = config['DICTIONARY_TAGGER'][
                'dictionary_file']
        except KeyError:
            self._logger.exception(
                "Oops! 'dictionary_file' is not set in config file. Defaulting to the csv file in the config dir."
            )
            self.dictionary_file = os.path.join(
                os.path.dirname(__file__), '..', 'config',
                'Steel-Terminology-Tata-Steel.csv')

        try:
            self.dict_tagger_fuzzy_matching = config['DICTIONARY_TAGGER'][
                'dict_tagger_fuzzy_matching']
            if "true" == self.dict_tagger_fuzzy_matching.lower():
                self.dict_tagger_fuzzy_matching = True
            elif "false" == self.dict_tagger_fuzzy_matching.lower():
                self.dict_tagger_fuzzy_matching = False
        except KeyError:
            self._logger.exception(
                "Oops! 'dict_tagger_fuzzy_matching' is not set in config file. Defaulting to False."
            )
            self.dict_tagger_fuzzy_matching = False

        try:
            self.dict_tagger_sim_threshold = float(
                config['DICTIONARY_TAGGER']['dict_tagger_sim_threshold'])
        except KeyError:
            self._logger.exception(
                "Oops! 'dict_tagger_sim_threshold' is not set in config file. Defaulting to 0.95."
            )
            self.dict_tagger_sim_threshold = 0.95

        self.dict_terms = load_terms_from_csv(self.dictionary_file)

        self._logger.info("normalising terms from dictionary...")
        self.dict_terms = [
            self.solrClient.get_industry_term_field_analysis(dict_term)
            for dict_term in self.dict_terms
        ]
        self._logger.info(
            "dictionary terms are normalised and loaded successfully. Total dictionary term size is [%s]",
            str(len(self.dict_terms)))

        if self.dict_tagger_fuzzy_matching:
            self._logger.info("loading into Trie nodes for fuzzy matching...")
            self.dict_terms_trie = TrieNode()
            for normed_term in self.dict_terms:
                self.dict_terms_trie.insert(normed_term)
            self._logger.info("loaded into Trie nodes successfully.")
        else:
            self.dict_terms_trie = TrieNode()

    def load_grammars(self):
        grammars = []

        pos_sequences = read_by_line(self.pos_sequences_file)
        for sequence_str in pos_sequences:
            grammars.append(sequence_str.replace('\n', '').strip())

        return grammars

    def parsing_candidates_regexp(self, text_pos_tokens, candidate_grammar):
        cp = nltk.RegexpParser(candidate_grammar)

        candidate_chunk = cp.parse(text_pos_tokens)
        term_candidates = set()
        for node_a in candidate_chunk:
            if type(node_a) is nltk.Tree:
                if node_a.label() == 'TermCandidate':
                    term_tokens = []
                    for node_b in node_a:
                        if node_b[0] == '"':
                            #TODO: find a more elegant way to deal with spurious POS tagging for quotes
                            continue
                        if node_b[1] == 'POS':
                            term_tokens.append(node_b[0])
                        elif node_b[1] == 'DT':
                            #only append if DT is in the middle, e.g., ratio of the tensile
                            term_tokens.append('' if len(term_tokens) ==
                                               0 else node_b[0])
                            #continue
                        else:
                            term_tokens.append('' if len(term_tokens) ==
                                               0 else ' ')
                            term_tokens.append(node_b[0])

                    term_candidates.add(''.join(term_tokens))
        return term_candidates

    def sentence_split(self, content):
        """
        heuristic/pattern (e.g., by '\r\n' or '\t') based sentence splitting + NLTK's recommended sentence tokenizer         
        return list, sentence list
        """
        pattern_split = re.compile(r"[\r\n\t]")
        sent_list = pattern_split.split(content.strip())

        sent_list = [
            sent_tokenize(sent.strip()) for sent in sent_list if sent.strip()
        ]
        #flatten sentence list
        sent_list = [item for sublist in sent_list for item in sublist]
        return sent_list

    def term_dictionary_tagging(self, doc_id):
        """
        tag the indexed content of a document against the terminology dictionary
        return set, the set of dictionary-matched terms to be indexed
        """

        self._logger.debug("term dictionary tagging for single document ...")

        indexed_terms = self.solrClient.query_indexed_terms_by_docId(
            doc_id, self.solr_field_content)
        indexed_terms = set(indexed_terms.keys())

        with MultiprocPool(processes=int(self.parallel_workers)) as pool:
            tagged_terms = pool.starmap(
                term_async_comparison,
                [(indexed_term, self.dict_terms,
                  self.dict_tagger_fuzzy_matching, self.dict_terms_trie,
                  self.dict_tagger_sim_threshold)
                 for indexed_term in indexed_terms])

        tagged_terms = set(filter(None, set(tagged_terms)))
        self._logger.debug("final dictionary tagged terms size: [%s]",
                           str(len(tagged_terms)))

        self._logger.debug(
            "Term dictionary tagging for current doc is completed.")
        return tagged_terms

    def term_candidate_extraction(self, content):
        """
        Sentence-based term candidate extraction. The content is sentence-split and PoS-tagged before grammar-based parsing.
        params:
            content: content string to be analysed
        return set, term candidates extracted from content
        """
        self._logger.debug("term candidate extraction for single document...")

        term_candidates = set()
        grammars = [
            'TermCandidate: {' + item + '}' for item in self.load_grammars()
            if not item.startswith('#')
        ]

        sent_tokenize_list = self.sentence_split(content)

        for sent_content in sent_tokenize_list:
            pos_sent_content = self.linguistic_processor.customised_preprocessing(
                sent_content)
            # print(pos_sent_content)
            for candidate_grammar in grammars:
                pos_filter_candidates = self.parsing_candidates_regexp(
                    pos_sent_content, candidate_grammar)
                term_candidates.update(pos_filter_candidates)

        self._logger.debug("term_candidates size after PoS filtering: [%s]",
                           len(term_candidates))
        term_candidates = self.linguistic_filter(term_candidates)
        # print(term_candidates)
        term_candidates = self.frequency_filtering(term_candidates)

        self._logger.debug(
            "Term candidate extraction for current doc is completed.")
        return term_candidates

    def frequency_filtering(self, term_candidates):
        """
        Corpus (whole index) based frequency filtering
        
        params:
            term_candidates: set()
        
        return set, filtered term candidates
        """

        self._logger.debug(
            "term frequency filtering for candidates [%s] by min frequency [%s]  ...",
            str(len(term_candidates)), str(self._min_term_freq))
        filtered_term_candidates = set()

        terms_ttf_dict, normed_terms_dict = self.solrClient.totaltermfreq(
            self.solr_field_content, term_candidates)

        if self._min_term_freq > 1:
            for term in term_candidates:
                tc_ttf = self.get_term_ttf(normed_terms_dict[term],
                                           terms_ttf_dict)
                if tc_ttf == 0:
                    self._logger.warning(
                        "Error!! term [%s] has no ttf value. Please check the tokenisation method for irregular text or the min/max shingling range.",
                        term)
                if tc_ttf > self._min_term_freq:
                    filtered_term_candidates.add(term)
        else:
            #no frequency filtering is applied when the minimum term frequency is 1
            filtered_term_candidates = set(term_candidates)

        self._logger.debug(
            "current term candidate size after frequency filtering [%s]",
            str(len(filtered_term_candidates)))
        return filtered_term_candidates

    def get_term_ttf(self, term, ttf_dict):
        """
        get term ttf value from a given ttf dictionary returned from SolrClient.totaltermfreq
        return ttf numerical value
        """
        return ttf_dict[term]

    def check_min_char_limit(self, multiword_term):
        """
        return True if no token of the term is shorter than the minimum char length
        """
        for token in multiword_term.split(' '):
            if len(token) < self._min_char_length:
                return False

        return True

    def linguistic_filter(self, candidate_set=set()):
        """
        linguistic based term candidates filtering
        
        1) stopword based filtering: less aggressive stop word filtering
        2) ngram range filtering
        3) minimum character filtering: none of term unit length less than minimum char length
        
        """
        #TODO: check how many gold standards can be filtered
        self._logger.debug("linguistic filtering ...")
        self._logger.debug(
            "stopword size: [%s], minimum tokens allowed: [%s], maximum tokens allowed [%s], min character allowed: [%s]",
            str(len(self.stopword_list)), str(self._min_tokens),
            str(self._max_tokens), str(self._min_char_length))
        # filter by matching with a stopwords
        # use the word_lower_func to lowercase the words to do stopword match
        #   except symbolic character is uppercase (for e.g., Longitudinal S prints, US)
        word_lower_func = lambda w: re.escape(w.lower()) if len(w) > 2 else w
        resultSet = set([
            x for x in candidate_set if any(
                word_lower_func(word) in self.stopword_list
                for word in x.split()) is False
        ])

        # add back filtered results by removing first stopword
        stopword_filtered_resultSet = candidate_set - resultSet
        # print("stopword_filtered_resultSet:", stopword_filtered_resultSet)
        first_word_striped_resultset = [
            ' '.join(term.split()[1:]) for term in stopword_filtered_resultSet
            if len(term.split()) > 1
            and word_lower_func(term.split()[0]) in self.stopword_list
        ]
        # print("add back striped results:", first_word_striped_resultset)
        resultSet.update(first_word_striped_resultset)
        # print("results after stopwords filtering:", resultSet)

        resultSet = set([
            x for x in resultSet if len(x.split()) >= self._min_tokens
            and len(x.split()) <= self._max_tokens
        ])

        if self._min_char_length > 1:
            resultSet = set(
                [x for x in resultSet if self.check_min_char_limit(x)])

        resultSet = set(filter(None, resultSet))
        self._logger.debug(
            "linguistic filtering is completed. current candidate size [%s]",
            len(resultSet))

        #TODO: filter common noun (a noun that can be found in WordNet)
        # refer to C. Arora, M. Sabetzadeh, F. Zimmer, and P. Werner, "Improving Requirements Glossary Construction via Clustering : Approach and Industrial Case Studies," 2014.
        # Single-word common nouns typically either constitute general knowledge or do not convey any special meaning outside their context.
        # We retain as a candidate term any single-word noun that is not found in the dictionary, as well as any single-word common noun that is capitalised; these nouns are likely to denote abbreviations and proper nouns.
        return resultSet
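
As a quick illustration of the PoS-grammar chunking that parsing_candidates_regexp relies on, the standalone sketch below runs one assumed grammar over a single tagged sentence with plain nltk. The real patterns come from the pos_sequence_filter file and are not shown here, and the nltk tokenizer/tagger data must already be downloaded.

import nltk

# One assumed PoS-sequence grammar; the production patterns live in the
# 'pos_sequence_filter' file and are not part of this snippet.
grammar = 'TermCandidate: {<JJ>*<NN.*>+}'
cp = nltk.RegexpParser(grammar)

# Requires the nltk 'punkt' and 'averaged_perceptron_tagger' data packages.
sentence = "The tensile strength of the steel coil was measured."
tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

candidates = set()
for node in cp.parse(tagged_tokens):
    if isinstance(node, nltk.Tree) and node.label() == 'TermCandidate':
        candidates.add(' '.join(token for token, pos in node))

print(candidates)  # typically {'tensile strength', 'steel coil'}, depending on the tagger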