示例#1
0
    def iter_mentions(doc, window=1):
        """Yield one context record per usable link in ``doc``.

        ``doc`` is indexed with the keys ``'text'``, ``'links'`` and ``'_id'``;
        every link is indexed with ``'start'``, ``'stop'`` and ``'target'``.
        Sentence spans come from ``iter_sent_spans(doc['text'])`` and are read
        through their ``start`` / ``stop`` attributes.

        ``window`` is the number of sentences of context to aim for:
        ``window // 2`` sentences are taken before the sentence holding the
        link start and ``window - window // 2 - 1`` after the sentence holding
        the link end, clamped to the document's sentences.

        Yields ``(target, doc['_id'], mention, span)`` where ``mention`` is the
        context text and ``span`` is the ``(start, stop)`` of the link measured
        from the beginning of ``mention``.
        """
        sent_spans = list(iter_sent_spans(doc['text']))
        sent_offsets = [s.start for s in sent_spans]

        # Floor division keeps the offsets integral on Python 3 as well; with
        # true division the sentence indices below would become floats and
        # could not be used to index ``sent_spans``. The offsets do not depend
        # on the link, so they are computed once.
        lhs_offset = window // 2
        rhs_offset = (window - lhs_offset) - 1

        for link in doc['links']:
            # align the link span over sentence spans in the document
            # mention span may cross sentence bounds if sentence tokenisation is dodgy
            # if so, the entire span between bounding sentences will be used as context
            sent_start_idx = bisect_right(sent_offsets, link['start']) - 1
            sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1

            # widen to the requested window without leaving the document
            sent_start_idx = max(0, sent_start_idx - lhs_offset)
            sent_end_idx = min(len(sent_spans) - 1, sent_end_idx + rhs_offset)
            sent_offset = sent_spans[sent_start_idx].start

            # link offsets re-based onto the start of the context text
            span = (link['start'] - sent_offset, link['stop'] - sent_offset)
            target = trim_link_subsection(link['target'])
            target = trim_link_protocol(target)
            mention = doc['text'][sent_offset:sent_spans[sent_end_idx].stop]

            # filter out instances where the mention span is the entire sentence
            if span == (0, len(mention)):
                continue

            # filter out list item sentences: empty context, a leading '*', or
            # no closing punctuation / quote character at the end
            sm = mention.strip()
            if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'':
                continue

            yield target, doc['_id'], mention, span
示例#2
0
文件: text.py 项目: anukat2015/sift
    def iter_mentions(doc, window=1):
        """Yield ``(target, (span, mention))`` for every usable link in ``doc``.

        ``doc`` is indexed with the keys ``'text'`` and ``'links'``; every link
        is indexed with ``'start'``, ``'stop'`` and ``'target'``. Sentence spans
        come from ``iter_sent_spans(doc['text'])`` and are read through their
        ``start`` / ``stop`` attributes.

        ``window`` is the number of sentences of context to aim for:
        ``window // 2`` sentences are taken before the sentence holding the
        link start and ``window - window // 2 - 1`` after the sentence holding
        the link end, clamped to the document's sentences.

        ``mention`` is the context text and ``span`` is the ``(start, stop)``
        of the link measured from the beginning of ``mention``.
        """
        sent_spans = list(iter_sent_spans(doc['text']))
        sent_offsets = [s.start for s in sent_spans]

        # Floor division keeps the offsets integral on Python 3 as well; with
        # true division the sentence indices below would become floats and
        # could not be used to index ``sent_spans``. The offsets do not depend
        # on the link, so they are computed once.
        lhs_offset = window // 2
        rhs_offset = (window - lhs_offset) - 1

        for link in doc['links']:
            # align the link span over sentence spans in the document
            # mention span may cross sentence bounds if sentence tokenisation is dodgy
            # if so, the entire span between bounding sentences will be used as context
            sent_start_idx = bisect_right(sent_offsets, link['start']) - 1
            sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1

            # widen to the requested window without leaving the document
            sent_start_idx = max(0, sent_start_idx - lhs_offset)
            sent_end_idx = min(len(sent_spans) - 1, sent_end_idx + rhs_offset)
            sent_offset = sent_spans[sent_start_idx].start

            # link offsets re-based onto the start of the context text
            span = (link['start'] - sent_offset, link['stop'] - sent_offset)
            target = trim_link_subsection(link['target'])
            target = trim_link_protocol(target)
            mention = doc['text'][sent_offset:sent_spans[sent_end_idx].stop]

            # filter out instances where the mention span is the entire sentence
            if span == (0, len(mention)):
                continue

            # filter out list item sentences: empty context, a leading '*', or
            # no closing punctuation / quote character at the end
            sm = mention.strip()
            if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'':
                continue

            yield target, (span, mention)
示例#3
0
文件: links.py 项目: anukat2015/sift
 def iter_unique_links(doc):
     """Yield each distinct trimmed link target of ``doc`` once.

     Targets are read from ``doc['links']``, passed through
     ``trim_link_subsection`` and then ``trim_link_protocol``, and emitted
     in order of first appearance.
     """
     seen = set()
     for entry in doc['links']:
         trimmed = trim_link_protocol(trim_link_subsection(entry['target']))
         if trimmed in seen:
             # already emitted for an earlier link
             continue
         yield trimmed
         seen.add(trimmed)
示例#4
0
文件: links.py 项目: zisding/sift
 def iter_unique_links(doc):
     """Generate the de-duplicated, trimmed link targets found in ``doc``.

     Every ``'target'`` under ``doc['links']`` is run through
     ``trim_link_subsection`` followed by ``trim_link_protocol``; a value is
     produced only the first time it is encountered.
     """
     emitted = set()
     for item in doc['links']:
         without_subsection = trim_link_subsection(item['target'])
         cleaned = trim_link_protocol(without_subsection)
         if cleaned in emitted:
             # duplicate of a target produced earlier
             continue
         yield cleaned
         emitted.add(cleaned)
示例#5
0
文件: links.py 项目: anukat2015/sift
    def iter_anchor_target_pairs(self, doc):
        """Yield ``(anchor, target)`` for each link of ``doc``.

        The target is the link's ``'target'`` after ``trim_link_subsection``
        and ``trim_link_protocol``. The anchor is the stripped slice of
        ``doc['text']`` between the link's ``'start'`` and ``'stop'``,
        lower-cased when ``self.lowercase`` is truthy. Pairs in which either
        side is empty are dropped.
        """
        for entry in doc['links']:
            destination = trim_link_protocol(trim_link_subsection(entry['target']))

            surface = doc['text'][entry['start']:entry['stop']].strip()
            surface = surface.lower() if self.lowercase else surface

            if not (surface and destination):
                # nothing useful to report for this link
                continue
            yield surface, destination
示例#6
0
文件: links.py 项目: zisding/sift
    def iter_anchor_target_pairs(self, doc):
        """Generate ``(anchor, target)`` tuples from the links of ``doc``.

        For every link, the ``'target'`` is cleaned with
        ``trim_link_subsection`` then ``trim_link_protocol``, and the anchor
        is ``doc['text'][start:stop]`` with surrounding whitespace removed.
        When ``self.lowercase`` is truthy the anchor is lower-cased. A tuple
        is produced only if both anchor and target are non-empty.
        """
        for item in doc['links']:
            cleaned_target = trim_link_subsection(item['target'])
            cleaned_target = trim_link_protocol(cleaned_target)

            anchor_text = doc['text'][item['start']:item['stop']]
            anchor_text = anchor_text.strip()
            if self.lowercase:
                anchor_text = anchor_text.lower()

            if not anchor_text or not cleaned_target:
                # skip links with an empty anchor or an empty target
                continue
            yield anchor_text, cleaned_target
示例#7
0
    def iter_mentions(doc):
        """Yield ``(target, context)`` for every link in ``doc``.

        ``context`` is the slice of ``doc["text"]`` running from the start of
        the sentence located for the link's ``"start"`` to the stop of the
        sentence located for the link's ``"stop"``. Sentences come from
        ``iter_sent_spans`` and are read through ``start`` / ``stop``.
        ``target`` is the link's ``"target"`` after ``trim_link_subsection``
        and ``trim_link_protocol``.
        """
        spans = list(iter_sent_spans(doc["text"]))
        starts = [span.start for span in spans]

        for entry in doc["links"]:
            # locate the sentences bounding the link by bisecting the
            # sentence start offsets
            first = bisect_right(starts, entry["start"]) - 1
            last = bisect_left(starts, entry["stop"]) - 1

            cleaned = trim_link_protocol(trim_link_subsection(entry["target"]))

            # a link may straddle sentence boundaries when sentence splitting
            # is unreliable; the context then covers everything between the
            # two bounding sentences
            context = doc["text"][spans[first].start : spans[last].stop]
            yield cleaned, context
示例#8
0
 def iter_comentions(links):
     """Yield ``(link, Counter(other_links))`` for each distinct trimmed target.

     Each element of ``links`` is indexed with ``'target'``; the value is
     passed through ``trim_link_subsection`` and ``trim_link_protocol`` and
     duplicates are collapsed with a set, so the order in which targets are
     produced follows set iteration order. For every target, the Counter
     holds all of the remaining distinct targets.
     """
     links = list({trim_link_protocol(trim_link_subsection(l['target'])) for l in links})
     # enumerate replaces the former xrange index loop, which raised
     # NameError on Python 3; it behaves the same on Python 2.
     for i, link in enumerate(links):
         yield link, Counter(links[:i] + links[i + 1:])