def main(args):
    """Load entity/wikilink and redirect mappings into the remote redis server."""
    from utils import charset_wrapper
    import redis

    conn = redis.StrictRedis(host='172.18.28.118')
    logger = xuxian.log.system_logger

    logger.info('loading wikiline2entity file....')
    build_entity_wikilink_map(charset_wrapper(open(args.mid2wiki)), conn)

    logger.info('loading redirect file....')
    build_redirect_wikilink_map(charset_wrapper(open(args.redirect)), conn)

    logger.info('finished init global object')
Exemplo n.º 2
0
def main(args):
    """ Extract single-entity and entity-pair contexts from a wiki dump.

    Resumable batch job: progress is checkpointed per (doc title, line
    number) through xuxian.remember/recall, so a restarted task skips
    every line up to the last remembered one.
    """
    # last checkpoint recorded for this task (None once we pass it / start fresh)
    recovery_state = xuxian.recall(args.task_id)
    syslog = xuxian.log.system_logger
    Dict = xuxian.log.LogDict

    # init global object
    docs = wikiobj_to_doc(charset_wrapper(open(args.wiki_file)))
    nlp = init_corenlp(args.nlp_server, args.nlp_port)
    robj = redis.StrictRedis(host=args.redis_server, port=args.redis_port,
            db=args.redis_db)

    # init output dump file
    entity_outfile = xuxian.apply_dump_file('entity',
            args.single_entity_output_file)
    entity_pair_outfile = xuxian.apply_dump_file('entity-pair',
            args.entity_pair_output_file)

    # iterate over data input
    for doc in docs:
        syslog.info('to process doc_title=' + doc['title'].encode('utf-8'))
        for (lineno, line) in enumerate(doc['text']):
            # at the correct time point, clear the recovery state
            # (clear happens before the skip test below, so the remembered
            # line itself is processed again on resume)
            if recovery_state == doc['title'] + str(lineno):
                recovery_state = None
            # still before the checkpoint: skip already-processed lines
            if recovery_state is not None:
                continue

            # every line is a paragraph in wikipedia
            line = line.rstrip()
            if not line:
                continue

            # strip wiki markup; mention offset info is kept separately
            plaintext = get_plain_text(line)
            mentions = get_plain_text_mention_info(line)
            syslog.debug(Dict({'plaintext' : plaintext[:100].encode('utf-8'),
                'mention':str(mentions)}))

            # dependency-parse the whole paragraph via the CoreNLP server
            depparsed_output = depparse_paragraph(plaintext, nlp)
            if u'sentences' not in depparsed_output:
                # TODO: empty ?
                continue

            sentences = depparsed_output[u'sentences']

            syslog.debug('to process doc_title=' + doc['title'].encode('utf-8')
                    + '\tdoc_line=' + plaintext[:80].encode('utf-8'))
            process_paragraph_single_entity(sentences, mentions,
                    robj, entity_outfile)

            process_paragraph_multiple_entity(sentences, mentions,
                    robj, entity_pair_outfile)

            # checkpoint only after the line is fully processed.
            # NOTE(review): the stored key is utf-8 encoded, while the resume
            # comparison above uses doc['title'] + str(lineno) — confirm that
            # xuxian.recall returns the same representation, or the resume
            # condition may never match.
            xuxian.remember(args.task_id, (doc['title'] + unicode(lineno)).encode('utf-8'))
    # NOTE(review): nested function — defined inside main but never called
    # from here; looks like it was intended as a module-level sibling.
    def test(args):
        """ check for all key if they still exist in remote redis server """
        from utils import charset_wrapper
        import redis
        r = redis.StrictRedis(host='172.18.28.118')
        syslog = xuxian.log.system_logger

        # verify every wiki2mid mapping recorded from the mid2wiki file
        for entity, wikilink in (line.strip().split('\t') for line in charset_wrapper(open(args.mid2wiki))):
            wikilink = upper_first_letter(wikilink)
            res = r.get('wiki2mid' + wikilink.encode('utf-8'))
            if res != entity.encode('utf-8'):
                syslog.info((u'err_wiki2mid\tkey=' + wikilink + u'\tval=' + entity).encode('utf-8'))
            else:
                syslog.info('ok_wiki2mid\tkey=' + wikilink.encode('utf-8'))

        # verify every redirect mapping recorded from the redirect file
        for link1, link2 in (line.rstrip().split('\t') for line in charset_wrapper(open(args.redirect))):
            link1, link2 = upper_first_letter(link1), upper_first_letter(link2)

            res = r.get('redir' + link1.encode('utf-8'))
            if res != link2.encode('utf-8'):
                syslog.info((u'err_redir\tkey=' + link1 + u'\tval=' + link2).encode('utf-8'))
            else:
                syslog.info('ok_redir\tkey=' + link1.encode('utf-8'))
def build_key_properties_table(event_schema_file):
    """
    Read the schema file, return a object as {event_type: set_of_key_properties}

    File format: an unindented line names an event type; each following
    tab-indented line is one key property of that type.  Blank lines are
    ignored.  An empty file yields an empty dict.
    """
    keytable, evtype, event = {}, None, set()
    for line in charset_wrapper(open(event_schema_file)):
        line = line.rstrip()
        if not line:
            # skip blank lines so they are not mistaken for an
            # empty-string event type header
            continue

        if line.startswith(u'\t'):
            # property line: belongs to the current event type
            event.add(line.lstrip())
        else:
            # new event-type header: flush the previous type first
            if evtype is not None:
                keytable[evtype] = event
            evtype = line
            event = set()

    # flush the trailing event type; the guard prevents a bogus
    # {None: set()} entry when the file is empty
    if evtype is not None:
        keytable[evtype] = event
    return keytable
def find_context_sentence_for_events(robj, outfile, event_schema,
                                     string_to_mid, sentence_entity_file):
    """For each sentence in the entity file, look up every candidate event
    key in redis and dump the context of the events that exist."""
    known_types = event_schema.keys()
    reader = sentence_reader(charset_wrapper(open(sentence_entity_file)))

    for sentence, mentions in reader:
        # mention id -> its two position fields
        positions = {}
        for mention in mentions:
            positions[mention[0]] = (mention[1], mention[2])

        # probe redis for every (event type, mention pair) candidate key
        for evtype, first, second in enumerate_rkeys(known_types, mentions):
            payload = robj.get(make_rkey(evtype, (first[0], second[0])))
            if payload is None:
                continue

            output_ev_context(outfile, sentence, positions, json.loads(payload))
Exemplo n.º 6
0
                doc_obj['url'] = m.group(4)
                doc_obj['title'] = m.group(6)
                doc_obj['text'] = []
                continue

        if not state_out_of_doc and DOC_END_PATTERN.match(line):
            state_out_of_doc = True
            yield doc_obj
            continue

        if not state_out_of_doc:
            doc_obj['text'].append(line)
            continue

        raise ValueError('failed with: state_out_of_doc=%s line=%s' %
                         (state_out_of_doc, line))


if __name__ == "__main__":
    # smoke test: parse the wiki dump named on the command line and
    # pretty-print its first doc (Python 2: print statement, .next())
    import sys
    from utils import charset_wrapper

    class PrettyDoc(dict):
        # thin dict wrapper whose str() renders id/url/title plus the
        # first 5 text lines of a parsed doc
        def __str__(self):
            return ("id:\t" + str(self['id']) + "\n" + "url:\t" +
                    str(self['url']) + "\n" + "title:\t" + str(self['title']) +
                    "\n" + "\n".join(self['text'][:5]))

    doc = wikiobj_to_doc(charset_wrapper(open(sys.argv[1], 'r'))).next()
    print PrettyDoc(doc)
def build_string_mid_table(mid_to_entity_file):
    """ Turn a tab-separated file <mid, string> into a dict <string, mid> """
    table = {}
    for raw in charset_wrapper(open(mid_to_entity_file, 'r')):
        # each line is "<mid>\t<string>"; later duplicates win, as dict() would
        mid, name = raw.rstrip().split('\t')
        table[name.strip()] = mid
    return table
Exemplo n.º 8
0
        mention_len = pat_end - pat_start
        cursor += mention_len
        plain_mention_cursor += len(mention_repr)

    # no more mention in tail, cursor info is trival now

    return plain_mentions

if __name__ == "__main__":
    from utils import charset_wrapper
    from wiki_doc import wikiobj_to_doc
    from entity_mentions import get_entity_mentions, get_entity_mentions_in_lines
    from entity_mentions import get_plain_text, get_plain_text_mention_info
    import sys

    doc = wikiobj_to_doc(charset_wrapper(open(sys.argv[1], 'r'))).next()

    # unit test for get_entity_mentions and get_entity_mentions_in_lines
    print "\n".join("%3d" % i + "\t" + str(mention) + "\t====>\t" + 
            ", ".join(x.encode('utf-8') for x in (mention[2], mention[3]))
            for (i, mention) in get_entity_mentions_in_lines(doc['text']))

    # unit test for get_plain_text and get_plain_text_mention_info
    for line in doc['text']:
        plain_line = get_plain_text(line)
        mentions = get_plain_text_mention_info(line)

        if not mentions:
            continue

        print "\nplain_line:\t" + plain_line.encode('utf-8').rstrip()