Exemplo n.º 1
0
def mapper(jobid, filename, inputPath, topN, outputPath, model, table):

    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))

    stemmer = PorterStemmer()
    tks = {}

    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)

        for token in tokens:
            t = stemmer.stem(token)

            if t not in tks:
                tks[t] = 0

            tks[t] += 1
            total += 1

        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 1
    for t in tks:
        # ignore only one time word
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue

        # ignore the case contain number
        if "0" in t or "1" in t or "2" in t or "3" in t or "4" in t or "5" in t or "6" in t or "7" in t or "8" in t or "9" in t:
            needRemove.append(t)
            total -= tks[t]
            continue

        #if tks[t] > maxTF:
        #    maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(
        tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    ### select top N words
    # sort by tfc
    sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
    tks = {}
    maxTF = sortedTks[0][1]
    # Calculate tf
    top = 0
    for t, c in sortedTks:
        top += 1
        tks[t] = float(c) / float(maxTF)
        if top == topN:
            break

    projizz.jsonWrite(
        tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)

    return (filename, tks)
Exemplo n.º 2
0
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    """Collect labeled sentence data for relation classifiers from one file.

    For every article in this file that has an answer record in MongoDB
    (``projizz.result.yago.answer``), matches the article's extracted
    patterns against ``table`` and buckets sentences by relation:

      * ``linesByRelations``: relation -> sentences (pattern text removed)
        whose valid pattern maps to an *observed* relation of the article.
      * ``linesNoRelaByRelations``: same, for relations NOT observed.
      * ``POS`` / ``NEG``: relation -> {"text", "label"} dicts with the raw
        sentence, for training binary classifiers.

    Args:
        jobid: worker id, used only for progress messages.
        filename: JSON file name present in both input directories.
        inputPath: directory of raw article JSON files.
        inputPtnPath: directory of per-article extracted-pattern JSON files.
        model: unused in this mapper; kept for a uniform signature.
        table: pattern-id (str) -> info dict with "relations" and "pattern".
        confidence: threshold passed to projizz.isPatternValidate.

    Returns:
        (linesByRelations, linesNoRelaByRelations, POS, NEG)
    """

    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys of contentPtnJson are "<revid>.txt" (see `key` below);
    # x[:-4] strips ".txt" to get the bare revids to query.
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    # NOTE(review): filled with empty dicts below but never populated or
    # returned -- appears vestigial.
    supportInstanceByFile = {}

    # Accumulators over ALL answered articles in this file.
    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        supportInstanceByFile[key] = {}
        # Per-article buckets: relation -> line number -> [pattern ids]
        linesByRela = {}
        linesByNoRela = {}

        # Per-article classifier line numbers: relation -> [line numbers]
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                # Skip patterns below the confidence threshold.
                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence):
                    continue

                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:

                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        # Lines starting with "^" are excluded -- presumably
                        # a marker for non-sentence lines; confirm upstream.
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])

                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        # Merge this article's positive lines into the file-level result,
        # stripping each matched pattern's tokens out of the sentence text.
        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        # Same merge for the negative (non-observed relation) lines.
        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier: keep the raw (unstripped) sentence text.
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})

        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return linesByRelations, linesNoRelaByRelations, POS, NEG
Exemplo n.º 3
0
 def tokenize(words):
     """Tokenize a string via projizz; pass any other iterable through lazily."""
     if not isinstance(words, basestring):
         return (w for w in words)
     return projizz.getTokens(words)
Exemplo n.º 4
0
def mapper(jobid,filename,inputPath,outputPath,model,table):

    # Read article
    article = projizz.jsonRead( os.path.join(inputPath,filename) )

    stemmer = PorterStemmer()
    tks = {}

    print "Worker %d : Read %s into filter" % (jobid,filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)

        for token in tokens:
            t = stemmer.stem(token) 

            if t not in tks:
                tks[t] = 0

            tks[t] += 1
            total += 1

        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid,count)


    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)
        
    needRemove = []
    maxTF = 0
    for t in tks:
        # ignore only one time word
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue

        # ignore the case contain number
        if "0" in t or "1" in t or "2" in t or "3" in t or "4" in t or "5" in t or "6" in t or "7" in t or "8" in t or "9" in t:
            needRemove.append(t)
            total -= tks[t]
            continue

        if tks[t] > maxTF:
            maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks,os.path.join(outputPath,filename.replace(".json",".tfc")))
    
    # Calculate tf
    for t in tks:
        tc = tks[t]
        tks[t] = float(tc)/float(maxTF)

    projizz.jsonWrite(tks,os.path.join(outputPath,filename.replace(".json",".tf")))
    print "worker %d write out." % (jobid)

    return (filename,tks)
Exemplo n.º 5
0
def mapper(jobid,filename,inputPath,inputPtnPath,model,table,confidence):
    """Collect labeled sentence data for relation classifiers from one file.

    For every article in this file that has an answer record in MongoDB
    (``projizz.result.yago.answer``), matches the article's extracted
    patterns against ``table`` and buckets sentences by relation:

      * ``linesByRelations``: relation -> sentences (pattern text removed)
        whose valid pattern maps to an *observed* relation of the article.
      * ``linesNoRelaByRelations``: same, for relations NOT observed.
      * ``POS`` / ``NEG``: relation -> {"text", "label"} dicts with the raw
        sentence, for training binary classifiers.

    Args:
        jobid: worker id, used only for progress messages.
        filename: JSON file name present in both input directories.
        inputPath: directory of raw article JSON files.
        inputPtnPath: directory of per-article extracted-pattern JSON files.
        model: unused in this mapper; kept for a uniform signature.
        table: pattern-id (str) -> info dict with "relations" and "pattern".
        confidence: threshold passed to projizz.isPatternValidate.

    Returns:
        (linesByRelations, linesNoRelaByRelations, POS, NEG)
    """

    # Read article
    contentJson = projizz.jsonRead( os.path.join(inputPath,filename) )
    # Read ptn
    contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) )

    print "Worker %d : Read %s into filter" % (jobid,filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys of contentPtnJson are "<revid>.txt" (see `key` below);
    # x[:-4] strips ".txt" to get the bare revids to query.
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0

    # NOTE(review): filled with empty dicts below but never populated or
    # returned -- appears vestigial.
    supportInstanceByFile = {}

    # Accumulators over ALL answered articles in this file.
    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}


    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        supportInstanceByFile[key] = {}
        # Per-article buckets: relation -> line number -> [pattern ids]
        linesByRela = {}
        linesByNoRela = {}

        # Per-article classifier line numbers: relation -> [line numbers]
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                # Skip patterns below the confidence threshold.
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue

                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:

                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        # Lines starting with "^" are excluded -- presumably
                        # a marker for non-sentence lines; confirm upstream.
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])

                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        # Merge this article's positive lines into the file-level result,
        # stripping each matched pattern's tokens out of the sentence text.
        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens( article[lineN].lower() )
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        # Same merge for the negative (non-observed relation) lines.
        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens( article[lineN].lower() )
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier: keep the raw (unstripped) sentence text.
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append( {"text":article[lineN],"label":"pos"} )

        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append( {"text":article[lineN],"label":"neg"} )

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)

    return linesByRelations,linesNoRelaByRelations,POS,NEG
Exemplo n.º 6
0
 def tokenize(words):
     """Return tokens: strings are tokenized by projizz, other iterables
     are wrapped in a pass-through generator."""
     is_text = isinstance(words, basestring)
     return projizz.getTokens(words) if is_text else (w for w in words)