Пример #1
0
def prunengrams(n, freqlist, simpleskipgrams):
    """Drop every n-gram of order ``n`` whose count is below ``MINTOKENS``.

    Parameters:
        n: the n-gram order to prune; used as key into ``freqlist`` and
            ``simpleskipgrams``.
        freqlist: mapping from order to a frequency list that iterates as
            ``(ngram, count)`` pairs and supports ``del`` by n-gram.
        simpleskipgrams: mapping from order to a dict of skip-grams; each
            skip-gram entry stores its overall count under the ``None`` key.

    Side effects:
        Mutates ``freqlist[n]`` and, depending on the module-level flags
        ``DOINDEX`` / ``DOSKIPGRAMS``, the module-level ``index`` and
        ``simpleskipgrams[n]``. The module-level names are only read here,
        so no ``global`` declaration is needed.
    """
    log("Pruning " + str(n) + "-grams...", stream=sys.stderr)
    # Iterate over a snapshot: entries are deleted from freqlist[n] inside the
    # loop, and mutating a container while iterating over it is unsafe.
    for ngram, count in list(freqlist[n]):
        if count < MINTOKENS:
            del freqlist[n][ngram]
            if DOINDEX:
                del index[ngram]
            if DOSKIPGRAMS:
                # The skip-gram made of only the first and the last word of this n-gram.
                skipgram = ((ngram[0],), (ngram[-1],))
                if skipgram in simpleskipgrams[n] and simpleskipgrams[n][skipgram][None] <= count:
                    # Note: if skip-grams are not found on the same n-level,
                    # they are pruned because of this early pruning.
                    del simpleskipgrams[n][skipgram]

    log("Retained " + str(len(freqlist[n])) +  " " + str(n) + "-grams after pruning", stream=sys.stderr)
Пример #2
0
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    """Count the n-grams of order ``n`` in the corpus and collect skip-grams from them.

    The corpus is read from the module-level file object ``f``, which is
    rewound first. The module-level flags ``DOTOKENIZE``, ``DOCLASSER``,
    ``DOSKIPGRAMS``, ``DOINDEX`` and the value ``MINLENGTH`` are read but
    never assigned.

    Parameters:
        classer: object whose ``encodeseq`` is applied to each n-gram when
            ``DOCLASSER`` is set.
        n: order of the n-grams to count.
        freqlist: mapping from order to frequency list; ``freqlist[n]`` is
            updated through its ``count`` method.
        simpleskipgrams: mapping from order to a dict
            ``skipgram -> {None: total, body: count-or-line-set}``.
        skips: unused in this function.
        index: mapping from n-gram to the set of line numbers it occurs on;
            only updated when ``DOINDEX`` is set.
        linecount: total number of lines if known (0 if unknown); only used
            for the progress messages.

    Returns:
        The number of lines read (0 for an empty corpus).
    """
    log("Counting "+str(n)+"-grams ...", stream=sys.stderr)
    f.seek(0)
    gaps = list(consecutivegaps(n))
    # Start at -1 so that an empty corpus returns 0 instead of raising
    # UnboundLocalError on the final "i + 1".
    i = -1
    for i, line in enumerate(f):
        if i % 10000 == 0:
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str( round(((i+1) / float(linecount)) * 100)) + "% " + " (" + str(n) + "-grams)" , stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            # Split on single spaces and drop the empty strings that runs of spaces produce.
            line = [x for x in line.strip().split(' ') if x]
        for ngram in Windower(line, n):
            if DOCLASSER:
                ngram = tuple(classer.encodeseq(ngram))
            # When the (n-1)-grams are available, only count an n-gram whose
            # (n-1)-gram suffix and prefix are both still in that list.
            if n - 1 in freqlist:
                if not (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1]):
                    continue
            freqlist[n].count(ngram)
            if DOINDEX:
                try:
                    index[ngram].add(i)
                except KeyError:
                    index[ngram] = set((i,))
            if not (DOSKIPGRAMS and n >= 2 and ngram[0] != '<begin>' and ngram[-1] != '<end>'):
                continue
            for beginindex, length in gaps:
                preskip = ngram[:beginindex]
                postskip = ngram[beginindex+length:]
                if len(preskip) >= MINLENGTH and not (preskip in freqlist[len(preskip)]):
                    continue  # this skip-gram isn't going to make it over the min threshold
                if len(postskip) >= MINLENGTH and not (postskip in freqlist[len(postskip)]):
                    continue  # this skip-gram isn't going to make it over the min threshold

                skipgram = (preskip, postskip)
                body = ngram[beginindex:beginindex+length]
                # The None key holds the overall count, to save computation time later.
                if skipgram not in simpleskipgrams[n]:
                    simpleskipgrams[n][skipgram] = {None: 1}
                else:
                    simpleskipgrams[n][skipgram][None] += 1
                bodies = simpleskipgrams[n][skipgram]
                # Per body: a set of line numbers when indexing, a plain count otherwise.
                if body in bodies:
                    if DOINDEX:
                        bodies[body].add(i)
                    else:
                        bodies[body] += 1
                else:
                    if DOINDEX:
                        bodies[body] = set((i,))
                    else:
                        bodies[body] = 1

    log("Found " + str(len(freqlist[n])) +  " " + str(n) + "-grams and " + str(len(simpleskipgrams[n])) + " skip-grams", stream=sys.stderr)
    return i + 1
Пример #3
0
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    """Count the n-grams of order ``n`` in the corpus, plus first/last-word skip-grams.

    The corpus is read from the module-level file object ``f``, which is
    rewound first. The module-level flags ``DOTOKENIZE``, ``DOCLASSER``,
    ``DOSKIPGRAMS`` and ``DOINDEX`` are read but never assigned.

    For every counted n-gram with ``n >= 3`` that neither starts with
    ``'<begin>'`` nor ends with ``'<end>'``, the skip-gram
    ``((first,), (last,))`` is registered with the words in between as body.

    Parameters:
        classer: object whose ``encodeseq`` is applied to each n-gram when
            ``DOCLASSER`` is set.
        n: order of the n-grams to count.
        freqlist: mapping from order to frequency list; ``freqlist[n]`` is
            updated through its ``count`` method.
        simpleskipgrams: mapping from order to a dict
            ``skipgram -> {None: total, body: count-or-line-set}``.
        skips: unused in this function.
        index: mapping from n-gram to the set of line numbers it occurs on;
            only updated when ``DOINDEX`` is set.
        linecount: total number of lines if known (0 if unknown); only used
            for the progress messages.

    Returns:
        The number of lines read (0 for an empty corpus).
    """
    log("Counting "+str(n)+"-grams ...", stream=sys.stderr)
    f.seek(0)
    # Start at -1 so that an empty corpus returns 0 instead of raising
    # UnboundLocalError on the final "i + 1".
    i = -1
    for i, line in enumerate(f):
        if i % 10000 == 0:
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str( round(((i+1) / float(linecount)) * 100)) + "% " + " (" + str(n) + "-grams)" , stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            # Split on single spaces and drop the empty strings that runs of spaces produce.
            line = [x for x in line.strip().split(' ') if x]
        for ngram in Windower(line, n):
            if DOCLASSER:
                ngram = tuple(classer.encodeseq(ngram))
            # When the (n-1)-grams are available, only count an n-gram whose
            # (n-1)-gram suffix and prefix are both still in that list.
            if n - 1 in freqlist:
                if not (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1]):
                    continue
            freqlist[n].count(ngram)
            if DOINDEX:
                try:
                    index[ngram].add(i)
                except KeyError:
                    index[ngram] = set((i,))
            if DOSKIPGRAMS and n >= 3 and ngram[0] != '<begin>' and ngram[-1] != '<end>':
                skipgram = ((ngram[0],), (ngram[-1],))
                body = tuple(ngram[1:-1])
                # The None key holds the overall count, to save computation time later.
                if skipgram not in simpleskipgrams[n]:
                    simpleskipgrams[n][skipgram] = {None: 1}
                else:
                    simpleskipgrams[n][skipgram][None] += 1
                bodies = simpleskipgrams[n][skipgram]
                # Per body: a set of line numbers when indexing, a plain count otherwise.
                if body in bodies:
                    if DOINDEX:
                        bodies[body].add(i)
                    else:
                        bodies[body] += 1
                else:
                    if DOINDEX:
                        bodies[body] = set((i,))
                    else:
                        bodies[body] = 1

    log("Found " + str(len(freqlist[n])) +  " " + str(n) + "-grams", stream=sys.stderr)
    return i + 1
Пример #4
0
def pruneskipgrams(n, simpleskipgrams, skips):
    """Prune the skip-grams of order ``n`` that fall below the configured thresholds.

    A skip-gram entry is a dict holding its overall count under the ``None``
    key and one entry per skip body (a set of line numbers when ``DOINDEX``
    is set, otherwise a count). A skip-gram is removed when it has fewer than
    ``MINSKIPTYPES`` bodies or an overall count below ``MINSKIPGRAMTOKENS``.
    Otherwise each body occurring fewer than ``MINSKIPTOKENS`` times is
    removed (and subtracted from the overall count), after which the
    skip-gram is re-checked against the same two thresholds.

    Parameters:
        n: order whose skip-grams are pruned.
        simpleskipgrams: mapping from order to the skip-gram dict; mutated in place.
        skips: unused in this function.
    """
    skipgrams = simpleskipgrams[n]
    l = len(skipgrams)
    log("Pruning skip-" + str(n) + "-grams... (" +str(l)+")", stream=sys.stderr)
    # Iterate over a snapshot: skip-grams are deleted from the dict inside the
    # loop, which raises RuntimeError when items() is a live view.
    for i, (skipgram, data) in enumerate(list(skipgrams.items())):
        if i % 100000 == 0:
            log('\t\t@' + str(i), stream=sys.stderr)
        typecount = len(data) - 1  # minus the meta None/count entry
        prune = typecount < MINSKIPTYPES or data[None] < MINSKIPGRAMTOKENS
        if not prune:
            modified = False
            # Snapshot again: bodies are deleted from `data` while looping.
            for skip, data2 in list(data.items()):
                if not skip:
                    # Falsy keys (the None meta entry, or an empty body) are left alone.
                    continue
                if DOINDEX:
                    count = len(data2)
                else:
                    count = data2
                if count < MINSKIPTOKENS:
                    modified = True
                    # prune this skip content only
                    data[None] -= count
                    del data[skip]

            if modified:
                # recompute, things have changed
                typecount = len(data) - 1  # minus the meta None/count entry
                prune = typecount < MINSKIPTYPES or data[None] < MINSKIPGRAMTOKENS

        if prune:
            del skipgrams[skipgram]
    log("\t" +str(len(simpleskipgrams[n])) + " left after pruning",stream=sys.stderr)
Пример #5
0
def buildclasser():
    """Build a ``Classer`` from the unigram frequencies of the corpus and save it.

    Reads the module-level ``corpusfile`` line by line, adds each line's
    tokens (wrapped in ``'<begin>'`` / ``'<end>'`` markers) to a
    ``FrequencyList``, constructs a ``Classer`` from it and saves it to
    ``outputprefix + '.cls'``.

    Returns:
        The newly built classer.
    """
    log("Counting unigrams (for classer) ...",stream=sys.stderr)
    freqlist = FrequencyList()
    # "with" guarantees the corpus file is closed even if a line fails to process.
    with open(corpusfile) as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
            if DOTOKENIZE:
                # The tokenizer output is used as the token sequence directly.
                # Previously it was also passed through .strip().split(' '),
                # which fails when the tokenizer returns a list of tokens.
                # NOTE(review): assumes crude_tokenizer returns a sequence of
                # tokens rather than a string - confirm.
                tokens = crude_tokenizer(line.strip())
            else:
                tokens = line.strip().split(' ')
            freqlist.append(['<begin>'] + list(tokens) + ['<end>'])

    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
Пример #6
0
def buildcompgraph(freqlist):
    """Build, save and return the compositionality graph of all counted n-grams.

    For every n-gram in ``freqlist`` and every shorter order from
    ``MINLENGTH`` up to (but excluding) its own order, each window of that
    shorter order that is itself present in ``freqlist`` gets a directed edge
    pointing from the shorter n-gram to the longer one.

    The graph is written with ``write_gpickle`` to
    ``outputprefix + '.compgraph'`` before it is returned.
    """
    graph = DiGraph()
    for order in freqlist:
        log("Computing compositionality graph (processing " +str(order) + "-grams)", stream=sys.stderr)
        total = len(freqlist[order])
        for position, (ngram, _count) in enumerate(freqlist[order]):
            if position % 10000 == 0:
                # Progress as a percentage of this order's n-grams, two decimals.
                percentage = round((position / float(total)) * 100, 2)
                log('\t' + str(float(percentage)) + '%', stream=sys.stderr)
            for suborder in range(MINLENGTH, order):
                for part in Windower(ngram, suborder):
                    if part not in freqlist[suborder]:
                        continue
                    graph.add_edge(part, ngram)

    log("Writing compositionality graph to file", stream=sys.stderr)

    write_gpickle(graph, outputprefix + '.compgraph')
    return graph
Пример #7
0
def expandskipgrams(n, simpleskipgrams, skips): #OLD: OBSOLETE
    """Derive additional skip-grams of order ``n`` by comparing skip bodies pairwise.

    For every skip-gram already in ``simpleskipgrams[n]`` (iterated over a
    snapshot taken up front), each pair of distinct, truthy body keys is
    compared word by word. Words equal in both bodies before the first
    difference extend the left context, equal words after it extend the right
    context, and the differing part becomes the body of a new skip-gram that
    is added to (or merged into) ``simpleskipgrams[n]``.

    Parameters:
        n: order of the skip-grams to expand.
        simpleskipgrams: mapping from order to the skip-gram dict; mutated in place.
        skips: unused in this function.

    Returns nothing; progress and a final summary are written through ``log``.

    NOTE(review): the signature comment marks this function as obsolete.
    NOTE(review): uses ``xrange``, which only exists in Python 2.
    """
    log("Expanding skip-" + str(n) + "-grams...",stream=sys.stderr)
    # Snapshot, because new skip-grams are inserted into simpleskipgrams[n] below.
    cacheitems = list(simpleskipgrams[n].items())
    expansionsize = 0  # number of skip-gram entries created by the expansion
    for p, (skipgram, data) in enumerate(cacheitems):
        if p % 1000 == 0:  log( '\t\t@' + str(p) + ' - ' + str(expansionsize) + ' new skip-grams thus-far',stream=sys.stderr)
        # The pairwise comparison below is quadratic in the number of entries; announce big ones.
        if len(data) ** 2 >= 1000000:
            log( '\t\t\t@' + str(p) + ' -- ' + str(len(data)**2) + ' comparisons',stream=sys.stderr)

        processed = {}  # pairs already compared, so (a, b) and (b, a) are only handled once
        # NOTE(review): if items() is a live view and a new skip-gram equals the
        # current one, `data` would be mutated while being iterated - confirm.
        skipdata = data.items()
        for skip, skipindex in skipdata:            
            if skip:  # skips the None meta entry (and any other falsy key)
                for skip2, skipindex2 in skipdata:                        
                    if skip != skip2 and skip2 and not (skip2,skip) in processed:
                        processed[(skip,skip2)] = True
                        left = []   # shared words before the first difference
                        right = []  # shared words after the first difference
                        position = 0  # 0 = still before the first difference, 1 = after it
                        consecutive = True
                        gap = 0  # never read
                        prev = None
                        gapbegin = 0
                        gapsize = 1
                        # NOTE(review): indexes skip2 by positions of skip, so this assumes
                        # both bodies have the same length - TODO confirm.
                        for i in xrange(0,len(skip)):
                            w = skip[i]
                            if w == skip2[i]:
                                if position == 0:
                                    left.append(w)
                                elif position == 1:
                                    right.append(w)                                    
                            else:
                                if position == 0:
                                    # first differing word: the gap starts here
                                    gapbegin = i
                                    position = 1
                                elif position == 1 and prev:
                                    # multiple gaps
                                    # NOTE(review): `prev` is the previous word itself, so this
                                    # branch is taken on any further difference whenever that
                                    # word is truthy; the gapsize branch below then looks
                                    # effectively unreachable - confirm intent.
                                    consecutive = False    
                                    break
                                else:
                                    gapsize += 1
                            prev = w
                            
                        if not consecutive: continue
                        
                        # content of new gap (taken from skip2 only)
                        newskip = skip2[gapbegin:gapbegin+gapsize]
                
                        # the shared words are moved from the body into the surrounding context
                        newskipgram = ( skipgram[0] + tuple(left), tuple(right) + skipgram[-1] )
                        
                        if DOINDEX:
                            newskipindex = skipindex | skipindex2 #a union set
                        else:
                            newskipindex = skipindex + skipindex2 #a count (int)
                            
                        
                        # NOTE(review): the bare excepts below catch every exception, not only
                        # the missing-key case. With DOINDEX set, adding a set to an existing
                        # numeric None entry would presumably raise TypeError and replace the
                        # whole entry - confirm.
                        try:
                            simpleskipgrams[n][newskipgram][None] += newskipindex
                        except:
                            simpleskipgrams[n][newskipgram] = {None: newskipindex}
                            expansionsize += 1
                        try:                            
                            if DOINDEX:
                                simpleskipgrams[n][newskipgram][newskip].update(newskipindex)
                            else:
                                simpleskipgrams[n][newskipgram][newskip] += newskipindex
                        except:
                            simpleskipgrams[n][newskipgram][newskip] = newskipindex
                                
        if len(data) ** 2 >= 1000000:
            log( '\t\t\t(next)',stream=sys.stderr)               

    log("Found " + str(len(simpleskipgrams[n])) + " skip-" + str(n) + "-grams, of which "+str(expansionsize) + " from expansion step)", stream=sys.stderr)
Пример #8
0
        #        
        #        pruneskipgrams(n, simpleskipgrams, skips)
        
            
    
# Optionally build the compositionality graph over all counted n-grams.
if DOCOMPOSITIONALITY:
    compgraph = buildcompgraph(freqlist)
    



# Total number of n-gram occurrences, summed over every order in freqlist.
totalcount = 0
for n in freqlist:
    totalcount += sum([ f for f in freqlist[n].values() ])
            
log("Writing n-grams to file", stream=sys.stderr)

# NOTE(review): `f` is rebound to the output file here (and, on Python 2, by the
# list comprehension above). countngrams reads a module-level `f` as the corpus
# file, so this is only safe if all counting has finished - confirm.
f = open(outputprefix + '.phraselist', 'w')
# Header row naming the tab-separated columns of the phrase list.
f.write('#N\tN-GRAM\tOCCURRENCE-COUNT\tNORMALISED-IN-NGRAM-CLASS\tNORMALISED-OVER-ALL\tSUBCOUNT\tSUPERCOUNT\n')
for n in freqlist:
    for ngram, count in freqlist[n]:
        # Turn the n-gram into a space-separated string, decoding classes when a classer is in use.
        if DOCLASSER:
            ngram_s = " ".join(classer.decodeseq(ngram))        
        else:
            ngram_s = " ".join(ngram)        
        if DOCOMPOSITIONALITY:
            # NOTE(review): buildcompgraph adds edges from the shorter n-gram to the
            # longer one, so out_edges(ngram) would lead to n-grams containing this
            # one and in_edges(ngram) come from its sub-n-grams; the names
            # subcount/supercount may be swapped - confirm.
            subcount = str(len(compgraph.out_edges(ngram)))
            supercount = str(len(compgraph.in_edges(ngram)))
        else:
            # No graph available: emit placeholders.
            subcount = '-'
            supercount = '-'