Example #1
def tfidf(max_features=5000, prefix="extraction-", begin=1, end=26):
    # get stopwords
    sf = open('chi_stopwords.txt', 'r')
    stopwords = [x.strip() for x in sf.read().split(',')]
    vectorizer = tv(max_features=max_features)  #tokenizer=tokenizer)
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))
    tfidf = vectorizer.fit_transform(corpus.values()).toarray()
    print tfidf.shape
    voc = vectorizer.get_feature_names()
    wordssum = tfidf.sum(axis=0)
    index = range(len(voc))
    index = [
        index
        for (y, x, index) in sorted(zip(wordssum, voc, index), reverse=True)
        if x.encode('utf-8') not in stopwords
    ]
    print time.time() - st
    voc_sorted = [voc[i] for i in index]
    tfidfret = []
    print time.time() - getdatatime
    return tfidf, voc, txt
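
The example above segments each document with jieba, joins the tokens with spaces, and hands the result to tv, which appears to be an alias for scikit-learn's TfidfVectorizer. Below is a minimal, self-contained Python 3 sketch of the same idea; the sample documents and the stopword list are invented for illustration.

# Minimal Python 3 sketch of the jieba + TF-IDF pipeline shown above.
# Sample documents and the stopword list are made up.
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["我们喜欢自然语言处理", "机器学习和自然语言处理都很有趣"]
stopwords = ["我们"]

# jieba.cut returns a generator of tokens; joining them with spaces lets
# scikit-learn's default tokenizer split them back apart.
corpus = [" ".join(jieba.cut(doc, cut_all=False)) for doc in docs]

vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords)
tfidf = vectorizer.fit_transform(corpus)        # sparse (n_docs, n_terms) matrix
vocab = vectorizer.get_feature_names_out()      # term for each column

# Rank terms by their summed TF-IDF weight across all documents,
# as the example above does with wordssum.
weights = tfidf.toarray().sum(axis=0)
for weight, term in sorted(zip(weights, vocab), reverse=True):
    print(term, round(float(weight), 3))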
Example #2
def jiebaCount(max_features=5000, prefix="extraction-", begin=1, end=1):
    sf = open('chi_,.txt', 'r')
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    print "Data loaded."
    res = []
    count = []
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    sw = stopwords
    print len(txt)
    st = time.time()
    for i in range(len(txt)):
        r = " ".join(jieba.cut(txt[i])).split(" ")
        r = [x.strip() for x in r]
        r = filter(None, r)
        r = [x for x in r if not x in stopwords]
        res.append(r)
        count.append(Counter(r))
        #print i
    print "Counting cost " + str(time.time() - st) + " seconds."
    #sw = [x.strip() for x in sf.read().split(',')]
    #sw=[]
    #[sw.append(x) for x in stopwords if x not in sw]
    #print len(sw)
    #stopwords=sw
    #of=open('stopwords1.txt','w')
    #of.write(','.join(stopwords))
    #of.close()
    #for line in f.readlines():
    #    line=re.sub(r'\s','',line)#line=line.strip()
    #    res+=[x.strip() for x in line.split("/")]
    for x in count[0].most_common(20):
        print x[0], x[1]
    print len(count), len(count[0])
    return count
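
The example above builds per-document word frequencies: jieba segments each text, empty tokens and stopwords are dropped, and a collections.Counter is kept for every document. A minimal Python 3 sketch with made-up sample text and stopwords:

# Minimal Python 3 sketch of per-document word counting with jieba and Counter.
from collections import Counter
import jieba

texts = ["今天天气很好，我们去公园散步", "今天我们在公园里看书"]
stopwords = {"我们", "很", "在", "里", "去"}

counts = []
for text in texts:
    tokens = [t.strip() for t in jieba.cut(text)]
    tokens = [t for t in tokens if t and t not in stopwords]
    counts.append(Counter(tokens))

# Most frequent tokens of the first document, as in the example above.
for token, freq in counts[0].most_common(20):
    print(token, freq)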
Example #3
def tfidf(max_features=10000, path="/home/tingyubi/20w/data/", prefix="extraction-", begin=1, end=26):
    ### get stopwords
    sf = open('chi_n.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split('\n')]
    sf.close()
    ### load data
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print "Loading data cost " + str(getdatatime - st) + " seconds."
    ### cut text
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))
    jsonfile = "tfidf_cut_" + prefix + str(begin) + "_" + str(end) + ".json"
    f = open(jsonfile, 'w')
    json.dump(corpus, f)
    #f = open(jsonfile,'r')
    #corpus = json.load(f)
    f.close()
    ### tfidf vectorizer
    vectorizer = tv(max_features=max_features, stop_words=stopwords)  #tokenizer=tokenizer)
    tfidf = vectorizer.fit_transform(corpus.values())  #.toarray()
    print "Tfidf vectorizing cost " + str(time.time() - getdatatime) + " seconds."
    #print tfidf.shape
    voc = vectorizer.get_feature_names()
    ### sorting vocabulary
    #wordssum = tfidf.sum(axis=0)
    #index=range(len(voc))
    #index = [index for (y,x,index) in sorted(zip(wordssum,voc,index),reverse=True) if x.encode('utf-8') not in stopwords]
    #voc_sorted = [voc[i] for i in index]
    ### save to json file
    #jsonfile = "tfidf_"+prefix+str(begin)+"_"+str(end)+".json"
    #data={}
    #data['vocabulary']=voc
    #data['tfidf']=tfidf.tolist()
    #with open(jsonfile,'w') as f:
    #    json.dump(data,f)
    #f.close()
    ### save to pickle file
    pklfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".mat"
    f = open(pklfile, 'wb')
    cPickle.dump(tfidf, f, -1)
    f.close()
    vocfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".voc"
    f = open(vocfile, 'w')
    voca = voc
    f.write("\n".join(voca).encode('utf-8'))
    f.close()
    return tfidf, voc, txt
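
Besides vectorizing, the example above persists three artifacts: the segmented corpus as JSON (so the slow cutting step can be skipped on reruns), the sparse TF-IDF matrix via cPickle, and the vocabulary as a UTF-8 text file. A minimal Python 3 sketch of the same persistence steps, using pickle in place of cPickle and invented file names and documents:

# Minimal Python 3 sketch of persisting the segmented corpus, the sparse
# TF-IDF matrix, and the vocabulary, mirroring the example above.
import json
import pickle

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["我们喜欢自然语言处理", "机器学习和自然语言处理都很有趣"]
corpus = {i: " ".join(jieba.cut(doc, cut_all=False)) for i, doc in enumerate(docs)}

# 1. Save the segmented corpus as JSON so the cutting step can be skipped later.
with open("tfidf_cut_sample.json", "w", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False)

# 2. Fit TF-IDF and keep the matrix sparse; pickle handles scipy sparse matrices.
vectorizer = TfidfVectorizer(max_features=10000)
tfidf = vectorizer.fit_transform(corpus.values())
with open("tfidf_sample.mat", "wb") as f:
    pickle.dump(tfidf, f, protocol=pickle.HIGHEST_PROTOCOL)

# 3. Save the vocabulary, one term per line.
with open("tfidf_sample.voc", "w", encoding="utf-8") as f:
    f.write("\n".join(vectorizer.get_feature_names_out()))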
Example #4
def jiebaCounter(max_features=5000,
                 prefix="extraction-",
                 begin=1,
                 end=1,
                 dictionary=""):
    # get stopwords
    sf = open('chi_,.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    if dictionary == "":
        vectorizer = cv(max_features=max_features,
                        stop_words=stopwords)  #tokenizer=tokenizer)
    else:
        vocabulary = open(dictionary, 'r').read().split("\n")
        vectorizer = cv(vocabulary=vocabulary,
                        max_features=max_features,
                        stop_words=stopwords)  #tokenizer=tokenizer)
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))
    vect = vectorizer.fit_transform(corpus.values()).toarray()
    print vect.shape
    voc = vectorizer.get_feature_names()
    wordssum = vect.sum(axis=0)
    index = range(len(voc))
    index = [
        index
        for (y, x, index) in sorted(zip(wordssum, voc, index), reverse=True)
        if x not in stopwords
    ]
    print time.time() - st
    voc_sorted = [voc[i] for i in index]
    print time.time() - getdatatime
    return vect, voc, txt
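
The example above counts term occurrences with cv, presumably scikit-learn's CountVectorizer, and can either learn the vocabulary from the corpus or restrict counting to a fixed vocabulary loaded from a dictionary file. A minimal Python 3 sketch of that switch, with invented documents, stopwords, and vocabulary:

# Minimal Python 3 sketch of counting term frequencies with CountVectorizer,
# optionally against a fixed vocabulary. All sample data is made up.
import jieba
from sklearn.feature_extraction.text import CountVectorizer

docs = ["我们喜欢自然语言处理", "机器学习和自然语言处理都很有趣"]
stopwords = ["我们"]
fixed_vocabulary = ["自然语言", "机器", "学习"]  # e.g. read from a dictionary file

corpus = [" ".join(jieba.cut(doc, cut_all=False)) for doc in docs]

def count_terms(corpus, vocabulary=None):
    # With a fixed vocabulary, CountVectorizer only counts those terms;
    # otherwise it learns the vocabulary from the corpus itself.
    if vocabulary is None:
        vectorizer = CountVectorizer(max_features=5000, stop_words=stopwords)
    else:
        vectorizer = CountVectorizer(vocabulary=vocabulary, stop_words=stopwords)
    counts = vectorizer.fit_transform(corpus)
    return counts, vectorizer.get_feature_names_out()

learned_counts, learned_vocab = count_terms(corpus)
fixed_counts, fixed_vocab = count_terms(corpus, vocabulary=fixed_vocabulary)
print(learned_counts.shape, fixed_counts.shape)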
Example #5
parser = argparse.ArgumentParser()
parser.add_argument("max_features", type=int, help="number of max features")
parser.add_argument("prefix", type=str, help="prefix of json files")
parser.add_argument("begin", type=int, help="begin code of json files")
parser.add_argument("end", type=int, help="end code of json files")
parser.add_argument("outputfile", type=str, help="output vocabulary file path")
args = parser.parse_args()

# get stopwords
sf = open('chi_,.txt', 'r')
stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]

# load data
d = {}
st = time.time()
d, txt = getText(prefix=args.prefix, begin=args.begin, end=args.end)
getdatatime = time.time()
print "Loading data cost " + str(getdatatime - st) + " seconds."

# cut words
corpus = {}
for i in range(len(txt)):  #d.items():
    #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
    corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))

# tfidf
vectorizer = tv(max_features=args.max_features,
                stop_words=stopwords)  #tokenizer=tokenizer)
tfidf = vectorizer.fit_transform(corpus.values()).toarray()
print tfidf.shape
voc = vectorizer.get_feature_names()
print "Tfidf calculating cost "+str(time.time() - getdatatime)+" seconds."