import nltk


def generate_ngrams(line):
    """Replace unambiguous entity labels with <dbpedia:...> tokens and count n-grams."""
    result = []
    line = line.strip()
    for sentence in line_filter(' '.join(default_tokenize_func(line))):
        tokens_plain = []
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            # Greedy longest-match-first over spans of at most 20 tokens.
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # First word in the sentence -> do not attempt to link,
                    # its capitalization could be misleading (e.g. "Apple").
                    tokens_plain.append(token.lower())
                elif token in unambiguous_labels:  # TODO: check it doesn't span titles
                    uri = unambiguous_labels[token]  # get types
                    tokens_plain.append('<dbpedia:' + uri + '>')
                    i = j - 1
                    break
                elif i + 1 == j:
                    # Single word with no label match: keep it as a plain token so
                    # that unmatched words still contribute to the n-grams.
                    tokens_plain.append(token.lower())
            i += 1
        for n in range(1, N + 1):
            for ngram in nltk.ngrams(tokens_plain, n):
                result.append((' '.join(ngram), 1))
    return result
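
# A minimal driver sketch for the mapper above: aggregate its (ngram, 1) pairs,
# with collections.Counter standing in for the reduce step; the corpus file name
# is hypothetical.
from collections import Counter

ngram_counts = Counter()
with open('corpus.txt') as corpus:
    for line in corpus:
        for ngram, one in generate_ngrams(line):
            ngram_counts[ngram] += one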

def unpack_anchors(line):
    """Print ambiguity statistics per label: label, number of URIs, total count."""
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = ListPacker.unpack(uri_list)
    total_count = sum(int(c) for _, c in uri_counts)
    print(label + '\t' + str(len(uri_counts)) + '\t' + str(total_count))
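
# A minimal sketch of streaming an anchors file through unpack_anchors; each input
# line is assumed to be "label<TAB>packed-uri-list", matching the split('\t') above.
import sys

for anchor_line in sys.stdin:
    unpack_anchors(anchor_line.rstrip('\n'))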

def link(sentence):
    """Tokenize, POS-tag, extract candidate mentions, and link them via the semantic graph."""
    tokens = default_tokenize_func(sentence)
    pos_tokens = nltk.pos_tag(tokens)
    candidates = extract_candidates(pos_tokens)
    if len(candidates) > 0:
        graph = SemanticGraph(candidates)
        graph.do_iterative_removal()
        graph.do_linking()
    return candidates
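
# A minimal usage sketch for link(); the POS tagger needs nltk's model data, and
# the structure of the returned candidates is whatever extract_candidates produces.
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)

for candidate in link('Barack Obama visited Paris .'):
    print(candidate)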

def unpack_anchors(line):
    """Print only unambiguous anchors: labels that map to exactly one URI."""
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = ListPacker.unpack(uri_list)
    if len(uri_counts) > 1:
        return
    uri, count = uri_counts[0]
    print(label + '\t' + uri + '\t' + count)

def filter_labels(line):
    """Keep a label -> URI pair only if the URI dominates the label's counts."""
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = [(uri, int(count)) for uri, count in ListPacker.unpack(uri_list)]
    # Sum counts directly; zip(*uri_counts)[1] is not subscriptable in Python 3.
    total = sum(count for _, count in uri_counts)
    for uri, count in uri_counts:
        if count / total > args.percentile and count > args.min_count:
            print(label + '\t' + uri + '\t' + str(count))
            break
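
# A sketch of the CLI options filter_labels reads from the module-level args; the
# flag names mirror the attributes used above, but the defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description='Filter anchor labels by URI dominance.')
parser.add_argument('--percentile', type=float, default=0.9,
                    help='minimum share of the label total (assumed default)')
parser.add_argument('--min_count', type=int, default=100,
                    help='minimum absolute count (assumed default)')
args = parser.parse_args()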

def generate_ngrams(line):
    """Collect occurrences of known organic labels as ((label, uri), 1) pairs."""
    labels = []
    line = line.strip()
    for sentence in line_filter(' '.join(tokenize_possessive(default_tokenize_func(line)))):
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            # Greedy longest-match-first over spans of at most 20 tokens.
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # if first word in sentence -> skip, could be wrong (Apple)
                    continue
                elif token in organic_label_dict:
                    labels.append(((token, organic_label_dict[token]), 1))
                    i = j - 1
                    break
            i += 1
    return labels
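
# A minimal aggregation sketch for the ((label, uri), 1) pairs emitted above;
# the corpus file name is hypothetical.
from collections import Counter

label_counts = Counter()
with open('corpus.txt') as corpus:
    for line in corpus:
        label_counts.update(key for key, _ in generate_ngrams(line))

for (label, uri), count in label_counts.most_common(10):
    print(label, uri, count)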