def load_tagger(tType="standard", caseInsensitive=False):
    # Import the Stanford tagger.
    global TAGGER
    # Tagger for standard German.
    if tType == "standard":
        TAGGER = tagger("./tagger/german-hgc_tiger_tueba.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # hgc tagger for German.
    elif tType == "hgc":
        TAGGER = tagger("./tagger/german-hgc.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # Special tagger for children's texts.
    elif tType == "kids" and not caseInsensitive:
        TAGGER = tagger("./tagger/german-bidirectional_kinder.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # Special tagger for children's texts, ignoring letter case.
    elif tType == "kids" and caseInsensitive:
        TAGGER = tagger("./tagger/german-bidirectional-caseless_kinder.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # No valid tagger type given.
    else:
        print("Failed to load tagger. Tagger type needs to be one of "
              "'standard', 'hgc', 'kids'.")
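# Usage sketch for load_tagger (a hedged example: it assumes `tagger` is
# nltk.tag.stanford.StanfordPOSTagger and that the model/jar files referenced
# above exist locally with a working Java install; the original snippet does
# not show its imports, so these names are assumptions):
#
#     from nltk.tag.stanford import StanfordPOSTagger as tagger
#
#     TAGGER = None
#     load_tagger("kids", caseInsensitive=True)
#     print(TAGGER.tag("Der Hund bellt".split()))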
def extract(self, name, name_list, text, limit=1, is_wordnet=False):
    if not self.is_name_in_text(name_list, text):
        return {}, 0
    tagList = self.tag_list
    stopwords = self.stops
    wordDict = {}
    filtered_dict = {}
    sents = segmenter(text)
    wordcount = 0
    # Count every (word, POS) pair in the text.
    for sent in sents:
        tokens = tokenizer(sent.lower())
        terms = tagger(tokens)
        for t in terms:
            wordcount += 1
            key = '.'.join(t)
            try:
                wordDict[key] += 1
            except KeyError:
                wordDict[key] = 1
    # Keep only words whose POS prefix is accepted, that are not stopwords,
    # and that are at least three characters long.
    for term_s, count in wordDict.items():
        try:
            word, pos = term_s.split('.')
        except ValueError:
            continue
        if pos[:2] in tagList and word.lower() not in stopwords and len(word) >= 3:
            print(word, pos)
            if is_wordnet:
                meanList = self.abstract(word, pos, limit)
                for w in meanList:
                    filtered_dict[term_s] = count
            else:
                filtered_dict[term_s] = count
    return filtered_dict, wordcount
def tag_sentences(sentences, pos_symbol=False):
    tokenized = []
    for sent in sentences:
        tokenized.append(tokenizer(sent))
    processed_list = tagger(tokenized)
    if not pos_symbol:
        # Replace each raw POS symbol with its entry in POS_TAGS.
        output_list = []
        for sentence in processed_list:
            new_sentence = []
            for word in sentence:
                new_sentence.append((word[_IDX_WORD], POS_TAGS[word[_IDX_SYMBOL]]))
            output_list.append(new_sentence)
    else:
        output_list = processed_list
    return output_list
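# Usage sketch for tag_sentences (a hedged example: tokenizer, tagger,
# _IDX_WORD, _IDX_SYMBOL and POS_TAGS come from the surrounding module,
# which is not shown here; the calls below only illustrate the expected shapes):
#
#     sents = ["The dog barks.", "It is loud."]
#     raw = tag_sentences(sents, pos_symbol=True)
#     # e.g. [[('The', 'DT'), ('dog', 'NN'), ...], [('It', 'PRP'), ...]]
#     readable = tag_sentences(sents)
#     # same structure, but each symbol replaced by POS_TAGS[symbol]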
def __init_tools(self):
    # Warm up the NLP tools so later calls do not pay the start-up cost.
    test = "Just a test not for printing out or other use "
    tagger(tokenizer(test))
    segmenter(test)
    test = wn.synsets('test')
from collections import defaultdict


def semantic_rank(query, docids, index, repo):
    '''Uses the word classes of the query to check which document has the
    most words of the grammatically most important type in the text.

    Args:
        query: string with the words the document must contain.
        repo: a dictionary that maps docid to a list of tokens.
        index: dictionary that maps a word to a dictionary that maps docids
            to the count of that word in the document.
        docids: list of documents to evaluate.

    Returns:
        A dictionary mapping each docid to its score.
    '''
    # Define the macro classes.
    translator = {
        "AT": "articles", "IN": "preposition", "LS": "marker",
        "DT": "determinator", "POS": "genitives", "TO": "to",
        "UH": "interjection", "CC": "conjunction",
        "WDT": "wh", "WP": "wh", "WP$": "wh", "WRB": "wh",
        "EX": "there", "MD": "modal", "PDT": "pre-determiner",
        "RP": "particle", "PRP": "pronoun", "PRP$": "pronoun",
        "FW": "foreign word",
        "JJ": "adjective", "JJR": "adjective", "JJS": "adjective",
        "RB": "adverb", "RBR": "adverb", "RBS": "adverb",
        "VB": "verb", "VBG": "verb", "VBD": "verb", "VBN": "verb",
        "VBP": "verb", "VBZ": "verb",
        "NN": "noun", "NNS": "noun",
        "NNP": "proper noun", "NNPS": "proper noun",
        "CD": "number"
    }
    # Define the class scores (half values; they are doubled below).
    points = {
        "articles": 0, "marker": 0, "genitives": 0, "to": 0,
        "determinator": 0, "preposition": 0.05, "interjection": 0.05,
        "conjunction": 0.05, "wh": 0.05, "there": 0.05, "modal": 0.2,
        "pre-determiner": 0.1, "particle": 0.2, "pronoun": 0.1,
        "adjective": 0.3, "adverb": 0.3, "verb": 0.4, "noun": 0.4,
        "proper noun": 0.5, "foreign word": 0.5, "number": 0.5
    }
    filtered_query = query.replace("(", "").replace(")", "").split(" ")
    tags = tagger(filtered_query)
    tags = {key: points[translator[value]] * 2 for key, value in tags}
    rank = defaultdict(int)
    n_words = defaultdict(int)
    for word in filtered_query:
        if word in index.keys():
            for docid in index[word].keys():
                if docid in docids:
                    rank[docid] += tags[word] * index[word][docid]
                    n_words[docid] += index[word][docid]
    rank = {
        docid: (rank[docid] / n_words[docid] if n_words[docid] != 0 else 0)
        for docid in docids
    }
    return rank
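# Usage sketch for semantic_rank (a hedged example: `tagger` is assumed to
# behave like nltk.pos_tag, returning (word, Penn-Treebank-tag) pairs, and the
# tiny index below is illustrative only; note that `repo` is not used by the
# function body):
#
#     index = {"the": {2: 3}, "dog": {1: 2}}
#     scores = semantic_rank("the dog", [1, 2], index, repo={})
#     # doc 1 matches the noun "dog" (score 0.8 per occurrence) and ranks
#     # above doc 2, which only matches the article "the" (score 0).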