Example #1
# Module-level imports assumed by the examples below. FileUtil is a local
# helper (a minimal sketch is given after Example #1); the BeautifulSoup
# import shown is the older BS3 package (use "from bs4 import BeautifulSoup"
# with BeautifulSoup 4).
import os
import re
import chardet
import FileUtil
from BeautifulSoup import BeautifulSoup
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.cluster import AffinityPropagation, KMeans
from xml.dom.minidom import Document


def procress_nem_together():
    dir = r"F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages"
    file_stopwords = file('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]
    scan = FileUtil.ScanFile(dir, postfix='.nem')
    files = scan.scan_files()
    corpus = []
    for f in files:
        fr = file(f, 'r')
        corpus_file_path = str(f)[:-3] + 'token'
        fw = file(corpus_file_path, 'a')
        content = str(fr.read()).strip()
        if len(content) == 0:
            continue
        encoding = chardet.detect(content)['encoding']
        content = content.decode(encoding).encode('utf-8')
        content = re.sub("[\.\@\,\:\;\!\?\(\)]", " ", content)
        content = content.split()
        stemmer = SnowballStemmer('english')
        pro_content = ''
        for w in content:
            #w = stemmer.stem(w)
            # drop stopwords
            if w not in stopwords:
                pro_content += ' ' + w + ' '
        fw.write(pro_content)
        fw.close()
        fr.close()
    return
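
All of the examples rely on a local FileUtil module that the listing never shows. Below is a minimal sketch of a compatible FileUtil.ScanFile, assuming it only needs to walk a directory and collect files whose names end with the given postfix; the class name and scan_files() are taken from the calls above, everything else is an assumption.

# FileUtil.py -- hypothetical helper compatible with the calls above:
# ScanFile(directory, postfix='.nem').scan_files() returns the matching file paths.
import os

class ScanFile(object):
    def __init__(self, directory, postfix=None):
        self.directory = directory
        self.postfix = postfix

    def scan_files(self):
        matched = []
        for dirpath, dirnames, filenames in os.walk(self.directory):
            for name in filenames:
                if self.postfix is None or name.endswith(self.postfix):
                    matched.append(os.path.join(dirpath, name))
        return matched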
Example #2
def getNameEntity():
    dir = r"F:/Data/homework/clustering/test_data/ae_gold_standard/AE"
    scan = FileUtil.ScanFile(dir, postfix='.txt')
    files = scan.scan_files()
    for f in files:
        fr = file(f, 'r')
        ne_list = ['' for i in range(151)]
        for line in fr:
            line_list = line.split()
            for i in line_list[2:]:
                ne_list[int(line_list[0])] += str(i) + ' '
        for i, ne in enumerate(ne_list):
            if len(ne) == 0:
                print '----------'
                continue
            file_name = '/' + str(i).zfill(3) + '.nem'
            name_pos = str(f).rfind('\\')
            folder_name = str(f)[name_pos + 1:-4]
            file_append_path = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages/" + folder_name + file_name
            fw = open(file_append_path, 'w')
            #metadata = '\n<name_entity>\n'
            metadata = ne
            #metadata += '\n</name_entity>'
            fw.write(metadata)
            fw.close()
        fr.close()
Example #3
def procress_together():
    dir = r"F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages"
    file_stopwords = file('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]
    scan = FileUtil.ScanFile(dir, postfix='.f')
    files = scan.scan_files()
    corpus = []
    for f in files:
        fr = file(f, 'r')
        corpus_file_path = str(f)[:-1] + 'token'
        fw = file(corpus_file_path, 'w')
        soup = BeautifulSoup(fr.read())
        title = str(soup.title.string).strip()
        snippet = str(soup.snippet.string).strip()
        content = title + ' ' + snippet
        content = re.sub("[\.\@\,\:\;\!\?\(\)]", " ", content)
        content = content.split()
        stemmer = SnowballStemmer('english')
        pro_content = ''
        for w in content:
            #w = stemmer.stem(w)
            # drop stopwords
            if w not in stopwords:
                pro_content += ' ' + w + ' '
        fw.write(pro_content)
        fw.close()
        fr.close()
    return
Example #4
def getMetadata():
    dir = r"F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages"
    scan = FileUtil.ScanFile(dir, postfix='.html')
    files = scan.scan_files()
    for f in files:
        fr = file(f, 'r')
        file_append_path = str(f)[:-4] + 'nem'
        fw = open(file_append_path, 'a')
        soup = BeautifulSoup(fr.read())
        metas = soup.findAll('meta')
        #metadata = '\n<metadata>\n'
        metadata = ''
        if metas is not None:
            try:
                # keep only the content of the description and keywords meta tags
                for m in metas:
                    for attr in m.attrs:
                        for i in attr:
                            if str(i).lower() == 'name':
                                if str(m['name']).lower() == 'description':
                                    metadata += m['content']
                                if str(m['name']).lower() == 'keywords':
                                    metadata += m['content']

            except:
                print '-------------except------------------'
        fw.write(metadata)
        fw.close()
        fr.close()
    return
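
For reference, a more direct version of the meta-tag extraction above, assuming BeautifulSoup 4 (where tag.attrs is a dict and tags expose .get()); the function name extract_meta_text is only illustrative.

# Hypothetical BS4 variant of the description/keywords extraction in getMetadata().
from bs4 import BeautifulSoup

def extract_meta_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    metadata = ''
    for m in soup.find_all('meta'):
        # keep only <meta name="description"> and <meta name="keywords">
        if str(m.get('name', '')).lower() in ('description', 'keywords'):
            metadata += m.get('content', '')
    return metadata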
Example #5
def discard():
    dir = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/gold_standard"
    scan = FileUtil.ScanFile(dir, postfix='.xml')
    files = scan.scan_files()
    dis = [[] for i in range(30)]
    for n, f in enumerate(files):
        fr = file(f, 'r')
        soup = BeautifulSoup(fr.read())
        dis_doc = soup.discarded.contents
        for i, doc in enumerate(dis_doc):
            if i == 0:
                continue
            dis[n].append(doc['rank'])
    return dis
Example #6
def delete_file():
    dir = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages"
    scan = FileUtil.ScanFile(dir, postfix='.netoken')
    files = scan.scan_files()
    for f in files:
        os.remove(f)
Example #7
def ap(folder, d):
    dir = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages/" + folder
    scan = FileUtil.ScanFile(dir, postfix='.token')
    files = scan.scan_files()
    clusting_result_path = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/results/" + folder + '.xml'
    isExists = os.path.exists(clusting_result_path)
    if isExists:
        print 'exist'
        return
    corpus = []
    dis_list = []
    for i in d:
        dis_list.append(int(i))
    clust_list = []
    for f in files:
        # skip documents marked as discarded
        name_pos = str(f).rfind('\\') + 1
        dn = int(f[name_pos:-6])
        #print type(dn)
        if dn in dis_list:
            continue
        clust_list.append(dn)
        fr = file(f, 'r')
        content = str(fr.read()).strip()
        encoding = chardet.detect(content)['encoding']
        content = content.decode(encoding).encode('utf-8')
        #content = content.split()
        corpus.append(content)
        fr.close()
    vectorizer = CountVectorizer(stop_words='english')
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()
    weight = tf_idf.toarray()
    '''
    for i in range(len(weight)):
        for j in range(len(word)):  
            print str(word[j]).encode("utf-8"), str(weight[i][j]).encode("utf-8")  
        return
        '''
    agg = AffinityPropagation()
    agg.fit(tf_idf)
    labels = agg.labels_
    s = 0
    result = [[] for i in range(labels.size)]
    dis_len = len(dis_list)
    for i in range(150 - dis_len):
        result[labels[i]].append(clust_list[i])
    fw = file(clusting_result_path, 'w')
    doc = Document()
    root = doc.createElement('clustering')
    doc.appendChild(root)
    for i, e in enumerate(result):
        entity = doc.createElement('entity')
        entity.setAttribute('id', str(i))
        for r in e:
            rank = doc.createElement('doc')
            rank.setAttribute('rank', str(r))
            entity.appendChild(rank)
        root.appendChild(entity)
    fw.write(doc.toprettyxml(indent='  ', encoding='UTF-8'))
    print 'writing success'
    fr.close()
    fw.close()
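
The call site for ap() (and for tfidf_vector() below) is not part of the listing. Here is a sketch of the assumed per-folder driver, pairing each person folder with the discarded ranks returned by discard(); the folder listing and its alignment with the gold-standard order are assumptions.

# Assumed driver: cluster each person folder, skipping its discarded documents.
import os

def run_all():
    base = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages"
    folders = sorted(os.listdir(base))   # one sub-folder per person name (assumption)
    dis = discard()                      # dis[n] = discarded ranks for the n-th gold-standard file
    for n, folder in enumerate(folders):
        ap(folder, dis[n])               # or tfidf_vector(folder, dis[n]) for the KMeans variant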
Example #8
def tfidf_vector(folder, d):
    dir = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/web_pages/" + folder
    scan = FileUtil.ScanFile(dir, postfix='.token')
    files = scan.scan_files()
    clusting_result_path = "F:/Data/homework/clustering/test_data/WePS2_test_data/data/test/results/" + folder + '.xml'
    isExists = os.path.exists(clusting_result_path)
    if isExists:
        print 'exist'
        return
    corpus = []
    dis_list = []
    clust_list = []
    for i in d:
        dis_list.append(int(i))
    for f in files:
        # skip documents marked as discarded
        name_pos = str(f).rfind('\\') + 1
        dn = int(f[name_pos:-6])
        #print type(dn)
        if dn in dis_list:
            continue
        clust_list.append(dn)
        fr = file(f, 'r')
        content = str(fr.read()).strip()
        encoding = chardet.detect(content)['encoding']
        content = content.decode(encoding).encode('utf-8')
        #content = content.split()
        corpus.append(content)
        fr.close()
    #vectorizer = CountVectorizer(stop_words='english')
    tfv = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 2),
                          use_idf=1,
                          smooth_idf=1,
                          sublinear_tf=1,
                          stop_words='english')
    tfv.fit(corpus)
    tfidf_all = tfv.transform(corpus)
    cluster_num = 19
    kmeans = KMeans(n_clusters=cluster_num)
    kmeans.fit(tfidf_all)  # cluster the TF-IDF matrix
    labels = kmeans.labels_
    s = 0
    result = [[] for i in range(cluster_num)]
    dis_len = len(dis_list)
    for i in range(150 - dis_len):
        result[labels[i]].append(clust_list[i])
    fw = file(clusting_result_path, 'w')
    doc = Document()
    root = doc.createElement('clustering')
    doc.appendChild(root)
    for i, e in enumerate(result):
        entity = doc.createElement('entity')
        entity.setAttribute('id', str(i))
        for r in e:
            rank = doc.createElement('doc')
            rank.setAttribute('rank', str(r))
            entity.appendChild(rank)
        root.appendChild(entity)
    fw.write(doc.toprettyxml(indent='  ', encoding='UTF-8'))
    print 'writing success'
    fr.close()
    fw.close()