import nltk


def generate_ngrams(line):
    """Replace unambiguous entity labels with <dbpedia:...> tokens and count n-grams."""
    result = []
    line = line.strip()
    for sentence in line_filter(' '.join(default_tokenize_func(line))):
        tokens_plain = []
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            # Greedy longest-match-first over spans of at most 20 tokens.
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # First word in the sentence -> do not attempt to link,
                    # its capitalization could be misleading (e.g. "Apple").
                    tokens_plain.append(token.lower())
                elif token in unambiguous_labels:  # TODO: check it doesn't span titles
                    uri = unambiguous_labels[token]  # get types
                    tokens_plain.append('<dbpedia:' + uri + '>')
                    i = j - 1
                    break
                elif i + 1 == j:
                    # Single word with no label match: keep it as a plain token so
                    # that unmatched words still contribute to the n-grams.
                    tokens_plain.append(token.lower())
            i += 1
        for n in range(1, N + 1):
            for ngram in nltk.ngrams(tokens_plain, n):
                result.append((' '.join(ngram), 1))
    return result
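
# A minimal driver sketch for the mapper above: aggregate its (ngram, 1) pairs,
# with collections.Counter standing in for the reduce step; the corpus file name
# is hypothetical.
from collections import Counter

ngram_counts = Counter()
with open('corpus.txt') as corpus:
    for line in corpus:
        for ngram, one in generate_ngrams(line):
            ngram_counts[ngram] += one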

def unpack_anchors(line):
    """Print ambiguity statistics per label: label, number of URIs, total count."""
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = ListPacker.unpack(uri_list)
    total_count = sum(int(c) for _, c in uri_counts)
    print(label + '\t' + str(len(uri_counts)) + '\t' + str(total_count))
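
# A minimal sketch of streaming an anchors file through unpack_anchors; each input
# line is assumed to be "label<TAB>packed-uri-list", matching the split('\t') above.
import sys

for anchor_line in sys.stdin:
    unpack_anchors(anchor_line.rstrip('\n'))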

def link(sentence):
    """Tokenize, POS-tag, extract candidate mentions, and link them via the semantic graph."""
    tokens = default_tokenize_func(sentence)
    pos_tokens = nltk.pos_tag(tokens)
    candidates = extract_candidates(pos_tokens)
    if len(candidates) > 0:
        graph = SemanticGraph(candidates)
        graph.do_iterative_removal()
        graph.do_linking()
    return candidates
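
# A minimal usage sketch for link(); the POS tagger needs nltk's model data, and
# the structure of the returned candidates is whatever extract_candidates produces.
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)

for candidate in link('Barack Obama visited Paris .'):
    print(candidate)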

def unpack_anchors(line):
    """Print only unambiguous anchors: labels that map to exactly one URI."""
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = ListPacker.unpack(uri_list)
    if len(uri_counts) > 1:
        return
    uri, count = uri_counts[0]
    print(label + '\t' + uri + '\t' + count)

def filter_labels(line):
    """Keep a label -> URI pair only if the URI dominates the label's counts."""
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = [(uri, int(count)) for uri, count in ListPacker.unpack(uri_list)]
    # Sum counts directly; zip(*uri_counts)[1] is not subscriptable in Python 3.
    total = sum(count for _, count in uri_counts)
    for uri, count in uri_counts:
        if count / total > args.percentile and count > args.min_count:
            print(label + '\t' + uri + '\t' + str(count))
            break
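
# A sketch of the CLI options filter_labels reads from the module-level args; the
# flag names mirror the attributes used above, but the defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description='Filter anchor labels by URI dominance.')
parser.add_argument('--percentile', type=float, default=0.9,
                    help='minimum share of the label total (assumed default)')
parser.add_argument('--min_count', type=int, default=100,
                    help='minimum absolute count (assumed default)')
args = parser.parse_args()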

def generate_ngrams(line):
    """Collect occurrences of known organic labels as ((label, uri), 1) pairs."""
    labels = []
    line = line.strip()
    for sentence in line_filter(' '.join(tokenize_possessive(default_tokenize_func(line)))):
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            # Greedy longest-match-first over spans of at most 20 tokens.
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # if first word in sentence -> skip, could be wrong (Apple)
                    continue
                elif token in organic_label_dict:
                    labels.append(((token, organic_label_dict[token]), 1))
                    i = j - 1
                    break
            i += 1
    return labels
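
# A minimal aggregation sketch for the ((label, uri), 1) pairs emitted above;
# the corpus file name is hypothetical.
from collections import Counter

label_counts = Counter()
with open('corpus.txt') as corpus:
    for line in corpus:
        label_counts.update(key for key, _ in generate_ngrams(line))

for (label, uri), count in label_counts.most_common(10):
    print(label, uri, count)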