Example #1
# Requires collections.defaultdict and the project's Process module (wsun/abstracts).
from collections import defaultdict

def modified_process(filename):
    ''' Serial processing of abstracts, no topic modeling. '''
    abstracts = []       # parsed abstract objects
    dictionary = []      # corpus vocabulary

    # load stop words
    stops = set()
    stop_file = 'stopwords.txt'
    with open(stop_file, 'r') as stop_handle:   # 'rU' mode was removed in Python 3
        for row in stop_handle:
            stops.add(row.rstrip('\n'))
    
    dictlist = Process.load(filename, abstracts, stops) 
    # create dictionary
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext') if word in dictionary]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary) 
    bigramdict = []
    termbow = defaultdict(float)
    termbigram = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow.keys():
            termbow[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigram[pair] += 1.0
    # create dict of tfidf
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    return abstracts
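
For reference, the per-document counting and corpus-wide document-frequency tally that this example delegates to Process can be sketched self-contained; the names below are illustrative and not part of the wsun/abstracts API:

import math
from collections import Counter, defaultdict

def tfidf_sketch(docs):
    docfreq = defaultdict(float)           # number of documents containing each term
    bows = [Counter(doc) for doc in docs]  # per-document term counts (bag of words)
    for bow in bows:
        for term in bow:
            docfreq[term] += 1.0
    n = float(len(docs))
    # weight each term count by its inverse document frequency
    return [{t: tf * math.log(n / docfreq[t]) for t, tf in bow.items()}
            for bow in bows]

print(tfidf_sketch([['cell', 'biology'], ['cell', 'membrane'], ['biology']]))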
Example #2
import report
import process
from threading import Thread
import numpy as np
from queue import Queue, Empty

images = process.load("data")
#images = list(filter(lambda i: i[2] == "22276.png" or i[2] == "10635.png" or i[2] == "15055.png" or i[2] == "input_7_1.png" or i[2] == "input_2404_1.png" ,images))
images = list(filter(lambda i: i[0] != "unknown", images))

classified = []
threads = []

q = Queue(maxsize=0)
num_threads = 20


def worker(_q):
    # get_nowait() avoids the race between the empty() check and get():
    # another worker may drain the queue in between, leaving get() blocked forever.
    while True:
        try:
            image = _q.get_nowait()
        except Empty:
            break
        classified.append(process.classify(image))
        _q.task_done()


for i in images:
    q.put(i)

for _ in range(num_threads):
    t = Thread(target=worker, args=(q,))
    threads.append(t)
    t.start()

q.join()  # block until every queued image has been classified
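
If the queue could be refilled while the workers are running, a common alternative is one sentinel per worker; a minimal self-contained sketch of that pattern, with process.classify replaced by a stand-in:

from queue import Queue
from threading import Thread

def worker(q, out):
    while True:
        item = q.get()
        if item is None:          # sentinel: no more work
            break
        out.append(item * 2)      # stand-in for process.classify(item)

q, results, num_workers = Queue(), [], 4
for item in range(10):
    q.put(item)
for _ in range(num_workers):
    q.put(None)                   # one sentinel per worker
workers = [Thread(target=worker, args=(q, results)) for _ in range(num_workers)]
for t in workers:
    t.start()
for t in workers:
    t.join()
print(sorted(results))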
Example #3
File: cluster.py Project: wsun/abstracts
# Requires gensim (corpora, models), collections.defaultdict, and the
# project's Process, Lsa, and Lda modules.
from collections import defaultdict
from gensim import corpora, models

def process(filename):
    ''' Serial processing of abstracts, for evaluation purposes. '''

    abstracts = []
    dictionary = []

    # load stop words
    stops = set()
    stop_file = 'stopwords.txt'
    with open(stop_file, 'r') as stop_handle:   # 'rU' mode was removed in Python 3
        for row in stop_handle:
            stops.add(row.rstrip('\n'))
        
    dictlist = Process.load(filename, abstracts, stops) 
    # create dictionary
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext') if word in dictionary]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary) 
    bigramdict = []
    termbow = defaultdict(float)
    termbigram = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow.keys():
            termbow[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigram[pair] += 1.0
    # create dict of tfidf
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    ##### TOPICS
    # prepare dictionary and corpora for topic modeling
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    dictionary = corpora.Dictionary(docs)
    dictionary.save('abstracts.dict')           
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # load lsa and lda models
    numtopics = 10  # this can be adjusted
    lsaModel = Lsa.serial(corpus_tfidf, dictionary, numtopics)
    ldaModel = Lda.serial(corpus_tfidf, dictionary, numtopics, alpha=50.0/numtopics, eta=2.0/numtopics)

    # store lda and lsa representation in all abstracts
    for i in range(len(abstracts)):
        lsaVec = lsaModel[tfidf[corpus[i]]]
        ldaVec = ldaModel[tfidf[corpus[i]]]
        lsaVector = defaultdict(float)
        ldaVector = defaultdict(float)
        for v in lsaVec:
            lsaVector[v[0]] = v[1]
        for v in ldaVec:
            ldaVector[v[0]] = v[1]
        abstracts[i].Set('lsa', lsaVector)
        abstracts[i].Set('lda', ldaVector)
        abstracts[i].Set('numtopics', numtopics)
    
    return abstracts
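
Assuming Lsa.serial and Lda.serial wrap gensim's LsiModel and LdaModel (the surrounding corpora/models calls suggest so, but the wrappers are not shown here), the direct equivalents would look roughly like this, on a toy corpus:

from gensim import corpora, models

docs = [['cell', 'biology'], ['cell', 'membrane'], ['quantum', 'field']]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
corpus_tfidf = models.TfidfModel(corpus)[corpus]

numtopics = 2
lsaModel = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=numtopics)
ldaModel = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=numtopics,
                           alpha=50.0 / numtopics, eta=2.0 / numtopics)
print(lsaModel.print_topics())   # inspect the learned topics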
Example #4
    # Serial testing (fragment: assumes the MPI `rank`, `time`, Process,
    # and `filename` are defined earlier in the file)
    if rank == 0:
        print("Serial testing ...")
        abstracts = []
        dictionary = []

        # load stop words
        stops = set()
        stop_file = 'stopwords.txt'
        with open(stop_file, 'r') as stop_handle:   # 'rU' mode was removed in Python 3
            for row in stop_handle:
                stops.add(row.rstrip('\n'))

        sloadstart = time.time()
        dictlist = Process.load(filename, abstracts, stops) 
        sloadend = time.time()

        # create dictionary
        sdictstart = time.time()
        Process.create_dict(dictlist, dictionary)
        sdictend = time.time()

        # clean text of words not in dictionary
        scleanstart = time.time()
        for abstract in abstracts:
            abstext = [word for word in abstract.Get('cleantext') if word in dictionary]
            abstract.Set('cleantext', abstext)
        scleanend = time.time()

        sfreqstart = time.time()
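
The paired time.time() calls bracketing each phase above can be factored into a small context manager; a minimal sketch with illustrative names:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # print the wall-clock duration of the enclosed block
    start = time.time()
    yield
    print("%s: %.3fs" % (label, time.time() - start))

with timed("load"):
    sum(i * i for i in range(10 ** 6))   # stand-in for Process.load(...)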
Example #5
File: main.py Project: w007878/dmhw
            # fragment from a k-means loop: `blg` holds cluster assignments,
            # `num_clu` the per-cluster counts, `data` the samples
            num_clu[blg[i]] += 1

        cent = np.zeros([k, dim], np.float32)

        for i in range(n):
            cent[blg[i]] = cent[blg[i]] + data[i] / num_clu[blg[i]]

        for _k in range(k):
            if num_clu[_k] == 0:
                # reseed an empty cluster with a random point
                cent[_k] = np.asarray([random_sample(dim)], np.float32)

    return blg


if __name__ == '__main__':
    data, f = load('sample/')   # load() is defined earlier in the file
    n = data.shape[0]
    print(f)

    k = 5
    res = kmeans(data, k=k, itr=10)

    os.system('rm -r res/*')

    for i in range(k):
        os.system('mkdir res/' + str(i))

    for i in range(n):
        os.system('cp sample/' + f[i] + ' res/' + str(res[i]))

    print(res)
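
The shelled-out rm/mkdir/cp calls have portable stdlib equivalents; a sketch, assuming f, res, k, and n as defined above (note that shutil.rmtree removes the res directory itself, whereas rm -r res/* only empties it):

import os
import shutil

shutil.rmtree('res', ignore_errors=True)                      # rm -r res/*
for i in range(k):
    os.makedirs(os.path.join('res', str(i)), exist_ok=True)   # mkdir res/<i>
for i in range(n):
    shutil.copy(os.path.join('sample', f[i]),                 # cp sample/<f[i]> res/<res[i]>
                os.path.join('res', str(res[i])))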