Example #1
import time
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer as tv
# getText() is a project helper (defined elsewhere) that loads the numbered
# extract files and returns (metadata dict, list of raw document strings).

def tfidf(max_features=5000, prefix="extraction-", begin=1, end=26):
    # Get stopwords: the file is a single comma-separated line of terms.
    with open('chi_stopwords.txt', 'r') as sf:
        stopwords = [x.strip() for x in sf.read().split(',')]
    vectorizer = tv(max_features=max_features)
    # Load the raw documents and time the load.
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print "Loading data cost " + str(getdatatime - st) + " seconds."
    # Segment each document with jieba and rejoin with spaces so the
    # vectorizer's default whitespace tokenization applies.
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    tfidf = vectorizer.fit_transform(corpus.values()).toarray()
    print tfidf.shape
    voc = vectorizer.get_feature_names()
    # Rank the vocabulary by total tf-idf weight, dropping stopwords.
    wordssum = tfidf.sum(axis=0)
    order = [i for (w, v, i) in
             sorted(zip(wordssum, voc, range(len(voc))), reverse=True)
             if v.encode('utf-8') not in stopwords]
    voc_sorted = [voc[i] for i in order]  # computed for inspection; not returned
    print "Tfidf calculating cost " + str(time.time() - getdatatime) + " seconds."
    return tfidf, voc, txt
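
A minimal usage sketch, assuming the extraction-*.json data files, chi_stopwords.txt, and the project's getText helper are all in place:

# Hedged usage sketch for Example #1: build tf-idf over extracts 1..26.
mat, voc, txt = tfidf(max_features=5000, prefix="extraction-", begin=1, end=26)
print mat.shape             # (n_documents, n_features) dense tf-idf matrix
print len(voc), len(txt)    # vocabulary size and document count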
Example #2
import time
import jieba
from sklearn.feature_extraction.text import CountVectorizer as cv

def jiebaCounter(max_features=5000, prefix="extraction-", begin=1, end=1, dictionary=""):
    # Get stopwords; decode to unicode so they compare equal to the unicode
    # feature names sklearn returns.
    with open('chi_,.txt', 'r') as sf:
        stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    if dictionary == "":
        vectorizer = cv(max_features=max_features, stop_words=stopwords)
    else:
        # Restrict counting to a fixed vocabulary, one term per line.
        with open(dictionary, 'r') as df:
            vocabulary = df.read().split("\n")
        vectorizer = cv(vocabulary=vocabulary, max_features=max_features,
                        stop_words=stopwords)
    # Load the raw documents and time the load.
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print "Loading data cost " + str(getdatatime - st) + " seconds."
    # Segment each document with jieba; the vectorizer splits on whitespace.
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    vect = vectorizer.fit_transform(corpus.values()).toarray()
    print vect.shape
    voc = vectorizer.get_feature_names()
    # Rank the vocabulary by total count, dropping stopwords.
    wordssum = vect.sum(axis=0)
    order = [i for (w, v, i) in
             sorted(zip(wordssum, voc, range(len(voc))), reverse=True)
             if v not in stopwords]
    voc_sorted = [voc[i] for i in order]  # computed for inspection; not returned
    print "Counting cost " + str(time.time() - getdatatime) + " seconds."
    return vect, voc, txt
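
A minimal usage sketch; my_dict.txt is a hypothetical one-term-per-line vocabulary file:

# Hedged usage sketch for Example #2: count terms with and without a
# fixed vocabulary ('my_dict.txt' is a hypothetical file name).
counts, voc, txt = jiebaCounter(max_features=5000, prefix="extraction-", begin=1, end=1)
counts2, voc2, txt2 = jiebaCounter(prefix="extraction-", begin=1, end=1,
                                   dictionary="my_dict.txt")
print counts.shape, counts2.shape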
Example #3
import time
import jieba
from collections import Counter

def jiebaCount(max_features=5000, prefix="extraction-", begin=1, end=1):
    # NOTE: max_features is accepted but unused in this variant.
    # Get stopwords (comma-separated file, decoded to unicode).
    with open('chi_,.txt', 'r') as sf:
        stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    print "Data loaded."
    print len(txt)
    res = []
    count = []
    st = time.time()
    for i in range(len(txt)):
        # Segment with jieba, strip whitespace, drop empties and stopwords.
        r = [x.strip() for x in " ".join(jieba.cut(txt[i])).split(" ")]
        r = [x for x in r if x and x not in stopwords]
        res.append(r)
        count.append(Counter(r))
    print "Counting cost " + str(time.time() - st) + " seconds."
    # Show the 20 most frequent terms of the first document.
    for word, freq in count[0].most_common(20):
        print word, freq
    print len(count), len(count[0])
    return count
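
Since jiebaCount returns one Counter per document, corpus-wide frequencies can be obtained by merging them; a minimal sketch:

# Hedged usage sketch for Example #3: merge per-document Counters into
# corpus-wide term frequencies (Counter comes from collections).
counters = jiebaCount(prefix="extraction-", begin=1, end=1)
total = Counter()
for c in counters:
    total.update(c)
for word, freq in total.most_common(10):
    print word, freq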
Example #4
import time
import json
import cPickle
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer as tv

def tfidf(max_features=10000, path="/home/tingyubi/20w/data/", prefix="extraction-", begin=1, end=26):
    # NOTE: path is accepted but unused in this excerpt.
    ### get stopwords (one per line, decoded to unicode)
    with open('chi_n.txt', 'r') as sf:
        stopwords = [x.strip().decode('utf-8') for x in sf.read().split('\n')]
    ### load data
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print "Loading data cost " + str(getdatatime - st) + " seconds."
    ### cut text with jieba so the vectorizer can split on whitespace
    corpus = {}
    for i in range(len(txt)):
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    ### cache the segmented corpus as JSON so later runs can skip segmentation
    jsonfile = "tfidf_cut_" + prefix + str(begin) + "_" + str(end) + ".json"
    with open(jsonfile, 'w') as f:
        json.dump(corpus, f)
    ### tfidf vectorizer; keep the sparse matrix (no .toarray()) to save memory
    vectorizer = tv(max_features=max_features, stop_words=stopwords)
    tfidf = vectorizer.fit_transform(corpus.values())
    print "Tfidf vectorizing cost " + str(time.time() - getdatatime) + " seconds."
    voc = vectorizer.get_feature_names()
    ### save the sparse tf-idf matrix to a pickle file
    pklfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".mat"
    with open(pklfile, 'wb') as f:
        cPickle.dump(tfidf, f, -1)
    ### save the vocabulary, one term per line, UTF-8 encoded
    vocfile = "tfidf_" + prefix + str(begin) + "_" + str(end) + ".voc"
    with open(vocfile, 'w') as f:
        f.write("\n".join(voc).encode('utf-8'))
    return tfidf, voc, txt
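
The pickle and .voc files written above can be reloaded later without re-running segmentation; a minimal sketch using the default file names:

# Hedged usage sketch for Example #4: reload the cached artifacts written
# by tfidf() with the default arguments.
with open("tfidf_extraction-1_26.mat", 'rb') as f:
    mat = cPickle.load(f)                 # the sparse tf-idf matrix
with open("tfidf_extraction-1_26.voc", 'r') as f:
    vocab = f.read().decode('utf-8').split("\n")
print mat.shape, len(vocab)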
Example #5
import time
import argparse
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer as tv

parser = argparse.ArgumentParser()
parser.add_argument("max_features", type=int, help="number of max features")
parser.add_argument("prefix", type=str, help="prefix of json files")
parser.add_argument("begin", type=int, help="begin code of json files")
parser.add_argument("end", type=int, help="end code of json files")
parser.add_argument("outputfile", type=str, help="output vocabulary file path")
args = parser.parse_args()

# get stopwords (comma-separated file, decoded to unicode)
with open('chi_,.txt', 'r') as sf:
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]

# load data
st = time.time()
d, txt = getText(prefix=args.prefix, begin=args.begin, end=args.end)
getdatatime = time.time()
print "Loading data cost " + str(getdatatime - st) + " seconds."

# cut words with jieba so the vectorizer can split on whitespace
corpus = {}
for i in range(len(txt)):
    corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))

# tfidf
vectorizer = tv(max_features=args.max_features, stop_words=stopwords)
tfidf = vectorizer.fit_transform(corpus.values()).toarray()
print tfidf.shape
voc = vectorizer.get_feature_names()
print "Tfidf calculating cost " + str(time.time() - getdatatime) + " seconds."