def test():
    language = sys.argv[1]
    CHUNK_SIZE = 10
    cursor = Documents.conn.cursor()
    while True:
        documents = MDocuments()
        LIMIT = CHUNK_SIZE * 5
        documents.get_multi(limit = LIMIT,
                where="""id IN (SELECT id FROM documents_to_affected 
                                WHERE language='%s' LIMIT %d)""" % (language, LIMIT))
        docs = [Documents(**d) for d in documents.value()]
        if len(docs) == 0:
            print "NO DOCUMENTS!\nSLEEPING."
            timer.sleep_minute( 120 )
            continue
        c = 0
        for documents in chunks( docs, CHUNK_SIZE ):
            affect_docs( documents, language, cursor )
Exemplo n.º 2
0
def test():
    language = sys.argv[1]
    CHUNK_SIZE = 10
    cursor = Documents.conn.cursor()
    while True:
        documents = MDocuments()
        LIMIT = CHUNK_SIZE * 5
        documents.get_multi(
            limit=LIMIT,
            where="""id IN (SELECT id FROM documents_to_affected 
                                WHERE language='%s' LIMIT %d)"""
            % (language, LIMIT),
        )
        docs = [Documents(**d) for d in documents.value()]
        if len(docs) == 0:
            print "NO DOCUMENTS!\nSLEEPING."
            timer.sleep_minute(120)
            continue
        c = 0
        for documents in chunks(docs, CHUNK_SIZE):
            affect_docs(documents, language, cursor)
Exemplo n.º 3
0
    # Script body: fill in the `termvector` column of `documents` rows that
    # do not have one yet, polling forever.
    # NOTE(review): the enclosing header and the definition of LIMIT are not
    # part of this fragment; LIMIT is presumably a module-level batch size —
    # confirm.
    conn = get_connection(UNICODE=True)
    # conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    print "Analyzing whole database"
    sys.stdout.flush()
    while True:
        # A new cursor is opened for every poll.
        curr = conn.cursor()
        # Fetch at most LIMIT English/German documents inside the hard-coded
        # publication-date window that still have a NULL termvector.
        curr.execute(
            """SELECT id, language, text FROM documents WHERE termvector is null and language in ('en', 'de') 
                and pubdate>='01-05-2011' and pubdate<'01-07-2011' 
                LIMIT %s""",
            (LIMIT,),
        )
        # curr.execute("""SELECT id, text FROM documents WHERE id=20875243""")
        # Number of rows handled in this poll; zero triggers the sleep below.
        count = 0
        for document in curr:
            id, lang, text = document
            print "id=%s" % id
            termvector = get_termvector(text, lang, conn)
            # The UPDATE runs on a separate cursor so the cursor being
            # iterated is left untouched.
            insertcurr = conn.cursor()
            insertcurr.execute(
                """UPDATE documents
                    SET termvector=%s
                    WHERE id=%s""",
                (termvector, id),
            )
            count += 1
            sys.stdout.flush()
        # NOTE(review): no commit is issued in this fragment; whether the
        # updates are persisted depends on how get_connection() configures
        # the connection — confirm.
        if count == 0:
            print "SLEEPING"
            timer.sleep_minute(60)
Exemplo n.º 4
0
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()

    while True:
        curr.execute("""SELECT id, text, language FROM documents 
                WHERE
                --guid='tw:122144569302323201'
                EXISTS ( SELECT 1 FROM instances WHERE item_id=documents.id AND begintoken IS NULL)
                LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue
        id, text, lang = data
        print "id", id
        curr.execute("""SELECT * FROM instances
                WHERE item_id = %s
                AND begintoken IS NULL""", (id,))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_
        #print instances

        sent_tok = PunktSentenceTokenizer()

        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            #print '++++'
            sentence = text[sentidx[0]:sentidx[1]]
            #print sentence
            #print '----'
            for pos, indexes in enumerate(WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO indexy jsou pouze relativni k vete
                # ale instances je ma od zacatku!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                #print pos, word, indexes

                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end  =instance
                    #print i,instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                                if off == indexes[0]:
                                    instances[i][-1] = pos - 1
        for instance in instances:
            print instance
            id, entity_id, item_id, exact, offset, length, sid, begin, end =instance
            #print exact, ">>", sid, begin, end
            if end is None:
                if not " " in exact:
                    end = begin
                else:
                    end = -1
            curr.execute("""UPDATE instances
                    SET sid=%s, begintoken=%s, endtoken=%s
                    WHERE id=%s""", (sid, begin, end, id))
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()

    while True:
        curr.execute("""SELECT id, text, language FROM documents 
                WHERE
                --guid='tw:122144569302323201'
                EXISTS ( SELECT 1 FROM instances WHERE item_id=documents.id AND begintoken IS NULL)
                LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue
        id, text, lang = data
        print "id", id
        curr.execute(
            """SELECT * FROM instances
                WHERE item_id = %s
                AND begintoken IS NULL""", (id, ))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_
        #print instances

        sent_tok = PunktSentenceTokenizer()

        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            #print '++++'
            sentence = text[sentidx[0]:sentidx[1]]
            #print sentence
            #print '----'
            for pos, indexes in enumerate(
                    WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO indexy jsou pouze relativni k vete
                # ale instances je ma od zacatku!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                #print pos, word, indexes

                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end = instance
                    #print i,instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                                if off == indexes[0]:
                                    instances[i][-1] = pos - 1
        for instance in instances:
            print instance
            id, entity_id, item_id, exact, offset, length, sid, begin, end = instance
            #print exact, ">>", sid, begin, end
            if end is None:
                if not " " in exact:
                    end = begin
                else:
                    end = -1
            curr.execute(
                """UPDATE instances
                    SET sid=%s, begintoken=%s, endtoken=%s
                    WHERE id=%s""", (sid, begin, end, id))
Exemplo n.º 6
0
if __name__ == "__main__":
    LIMIT = 20
    conn = get_connection(UNICODE=True)
    #conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    print "Analyzing whole database"
    sys.stdout.flush()
    while True:
        curr = conn.cursor()
        curr.execute("""SELECT id, language, text FROM documents WHERE termvector is null and language in ('en', 'de') 
                and pubdate>='01-05-2011' and pubdate<'01-07-2011' 
                LIMIT %s""", (LIMIT ,))
        #curr.execute("""SELECT id, text FROM documents WHERE id=20875243""")
        count = 0
        for document in curr:
            id, lang, text = document
            print "id=%s" % id
            termvector = get_termvector(text, lang, conn)
            insertcurr = conn.cursor()
            insertcurr.execute("""UPDATE documents
                    SET termvector=%s
                    WHERE id=%s""", ( termvector, id ))
            count += 1
            sys.stdout.flush()
        if count == 0:
            print "SLEEPING"
            timer.sleep_minute(60)