Example #1
def load_tagger(tType="standard", caseInsensitive=False):
    # Load the Stanford POS tagger into the module-level TAGGER handle.
    global TAGGER

    # Tagger for standard German.
    if tType == "standard":
        TAGGER = tagger("./tagger/german-hgc_tiger_tueba.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # HGC tagger for German.
    elif tType == "hgc":
        TAGGER = tagger("./tagger/german-hgc.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # Special tagger for children's texts.
    elif tType == "kids" and not caseInsensitive:
        TAGGER = tagger("./tagger/german-bidirectional_kinder.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # Special tagger for children's texts, ignoring letter case.
    elif tType == "kids" and caseInsensitive:
        TAGGER = tagger("./tagger/german-bidirectional-caseless_kinder.tagger",
                        "./tagger/stanford-postagger-3.6.0.jar")
    # No valid tagger type given.
    else:
        print(
            "Failed to load tagger. Tagger type needs to be one of 'standard', 'hgc', 'kids'."
        )
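Here `tagger` is presumably NLTK's Stanford POS tagger wrapper. The sketch below shows one way the binding and a subsequent tagging call might look; the import alias, the sample sentence, and the assumption that the model files and Java are available are illustrative guesses, not part of the original.

# Sketch, assuming `tagger` aliases NLTK's Stanford wrapper and this code
# lives in the same module as load_tagger() above.
from nltk.tag.stanford import StanfordPOSTagger as tagger

TAGGER = None  # filled in by load_tagger()

load_tagger(tType="standard")
if TAGGER is not None:
    # StanfordPOSTagger.tag() takes a token list and returns (word, tag) pairs.
    print(TAGGER.tag("Die Katze schläft auf dem Sofa".split()))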
Example #2
    def extract(self, name, name_list, text, limit=1, is_wordnet=False):
        # Bail out early if none of the target names occurs in the text.
        if not self.is_name_in_text(name_list, text):
            return {}, 0
        tagList = self.tag_list
        stopwords = self.stops
        wordDict = {}
        filterd_dict = {}
        sents = segmenter(text)
        wordcount = 0
        # Count every (word, POS) pair in the text under a "word.POS" key.
        for sent in sents:
            tokens = tokenizer(sent.lower())
            terms = tagger(tokens)
            for t in terms:
                wordcount += 1
                key = '.'.join(t)
                try:
                    wordDict[key] += 1
                except KeyError:
                    wordDict[key] = 1

        # Keep only terms whose POS prefix is wanted, that are not stopwords,
        # and that are at least three characters long.
        for term_s, count in wordDict.items():
            try:
                word, pos = term_s.split('.')
            except ValueError:
                continue
            if pos[:2] in tagList and word.lower() not in stopwords and len(word) >= 3:
                print(word, pos)
                if is_wordnet:
                    # Only keep the term if WordNet yields at least one abstraction.
                    meanList = self.abstract(word, pos, limit)
                    for w in meanList:
                        filterd_dict[term_s] = count
                else:
                    filterd_dict[term_s] = count

        return filterd_dict, wordcount
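The helpers `segmenter`, `tokenizer`, and `tagger` are defined elsewhere in the source module. A plausible binding to NLTK's standard tools, stated here as an assumption since the imports are not part of the excerpt, would be:

# Assumed bindings for the helpers used above; not shown in the original excerpt.
import nltk

segmenter = nltk.sent_tokenize   # text -> list of sentences
tokenizer = nltk.word_tokenize   # sentence -> list of tokens
tagger = nltk.pos_tag            # token list -> list of (word, tag) pairs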
Example #3
def tag_sentences(sentences, pos_symbol=False):
    # Tokenize each sentence before handing the whole batch to the tagger.
    tokenized = []
    for sent in sentences:
        tokenized.append(tokenizer(sent))

    processed_list = tagger(tokenized)

    if not pos_symbol:
        # Replace each raw tag symbol with its human-readable name.
        output_list = []
        for sentence in processed_list:
            new_sentence = []
            for word in sentence:
                new_sentence.append((word[_IDX_WORD], POS_TAGS[word[_IDX_SYMBOL]]))
            output_list.append(new_sentence)
    else:
        output_list = processed_list

    return output_list
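`_IDX_WORD`, `_IDX_SYMBOL`, and `POS_TAGS` are module-level constants that are not part of this excerpt. A minimal sketch of what they might look like, assuming the tagger returns a (word, tag) pair per token:

# Assumed module-level constants; the real POS_TAGS table would cover the whole tagset.
_IDX_WORD = 0     # index of the token inside each (word, tag) pair
_IDX_SYMBOL = 1   # index of the tag symbol
POS_TAGS = {
    "NN": "noun",
    "VB": "verb",
    "JJ": "adjective",
    "RB": "adverb",
}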
Example #4
    def __init_tools(self):
        # Warm-up call: run the tagger, tokenizer, segmenter, and WordNet once
        # so that their models are loaded before any real processing happens.
        test = "Just a test not for printing out or other use "
        tagger(tokenizer(test))
        segmenter(test)
        test = wn.synsets('test')
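Here `wn` is presumably NLTK's WordNet interface; the assumed import, not shown in the excerpt, would be:

# Assumed import for the `wn` handle used above.
from nltk.corpus import wordnet as wn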
Example #5
def semantic_rank(query, docids, index, repo):
    '''Uses the part-of-speech classes of the query words to check which
    document contains the most words of the grammatically most important
    types.

    Args:
        query: string with the words the documents should contain.
        docids: list of document ids to score.
        index: dictionary mapping a word to a dictionary that maps
                docids to that word's count in the document.
        repo: a dictionary mapping each docid to a list of tokens.

    Returns:
        A dictionary mapping each docid to its score.
    '''
    # Map Penn Treebank tags to coarse grammatical classes.
    translator = {
        "AT": "articles",
        "IN": "preposition",
        "LS": "marker",
        "DT": "determinator",
        "POS": "genitives",
        "TO": "to",
        "UH": "interjection",
        "CC": "conjunction",
        "WDT": "wh",
        "WP": "wh",
        "WP$": "wh",
        "WRB": "wh",
        "EX": "there",
        "MD": "modal",
        "PDT": "pre-determiner",
        "RP": "particle",
        "PRP": "pronoun",
        "PRP$": "pronoun",
        "FW": "foreing word",
        "JJ": "adjective",
        "JJR": "adjective",
        "JJS": "adjective",
        "RB": "adverb",
        "RBR": "adverb",
        "RBS": "adverb",
        "VB": "verb",
        "VBG": "verb",
        "VBD": "verb",
        "VBN": "verb",
        "VBP": "verb",
        "VBZ": "verb",
        "NN": "noun",
        "NNS": "noun",
        "NNP": "proper noun",
        "NNPS": "proper noun",
        "CD": "number"
    }

    # Define half of each class's score; the value is doubled when applied below.
    points = {
        "articles": 0,
        "marker": 0,
        "genitives": 0,
        "to": 0,
        "determinator": 0,
        "preposition": 0.05,
        "interjection": 0.05,
        "conjunction": 0.05,
        "wh": 0.05,
        "there": 0.05,
        "modal": 0.2,
        "pre-determiner": 0.1,
        "particle": 0.2,
        "pronoun": 0.1,
        "adjective": 0.3,
        "adverb": 0.3,
        "verb": 0.4,
        "noun": 0.4,
        "proper noun": 0.5,
        "foreing word": 0.5,
        "number": 0.5
    }

    # POS-tag the query words and convert each tag into its (doubled) class score.
    filtered_query = query.replace("(", "").replace(")", "").split(" ")
    tags = tagger(filtered_query)
    tags = {key: points[translator[value]] * 2 for key, value in tags}

    rank = defaultdict(int)
    n_words = defaultdict(int)

    # Accumulate the weighted count of each query word per document.
    for word in filtered_query:
        if word in index.keys():
            for docid in index[word].keys():
                if docid in docids:
                    rank[docid] += tags[word] * index[word][docid]
                    n_words[docid] += index[word][docid]

    # Normalize each document's score by the number of matched query words.
    rank = {
        docid: (rank[docid] / n_words[docid] if n_words[docid] != 0 else 0)
        for docid in docids
    }

    return rank
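For context, a small usage sketch follows. The binding of `tagger` to NLTK's `pos_tag`, the module-level `defaultdict` import the function relies on, and the toy index and corpus are all assumptions made for illustration, not part of the original example.

# Usage sketch with assumed bindings and toy data.
from collections import defaultdict  # semantic_rank relies on this at module level
import nltk

tagger = nltk.pos_tag  # assumed: token list -> list of (word, tag) pairs

index = {
    "cats": {"doc1": 2},             # word -> {docid: count in that document}
    "sleep": {"doc1": 1, "doc2": 3},
}
repo = {"doc1": ["cats", "sleep"], "doc2": ["dogs", "sleep"]}

scores = semantic_rank("cats sleep", ["doc1", "doc2"], index, repo)
print(scores)  # e.g. {'doc1': 0.8, 'doc2': 0.8}, depending on the assigned tags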