Exemplo n.º 1
0
def InputWebKB(directory):
    """Load one WebKB university directory into a list of transactions.

    Expected directory layout (pass the top directory as ``directory``,
    e.g. ``DB = InputWebKB('Cornell')``)::

        Cornell
          |- course
          |-   :      department, faculty, other, project, staff
          `- student

    Example of a single input file used during experiments:
    ``Cornell/course/http_^^cs.cornell.edu^Info^Courses^Current^CS415^CS414.html``

    Args:
        directory: Path of the university directory. A trailing '/' is
            appended when it is missing.

    Returns:
        A list with one ``[index, category_index, tokens]`` entry per file.
        ``index`` is a running counter over all files read,
        ``category_index`` is the position of the file's category in the
        module-level ``categories`` sequence, and ``tokens`` is the set built
        from ``BeautifulNoun(...).alltokens()`` for that file.
    """
    database = []
    index = 0

    # Make sure the directory path ends with a separator before the category
    # names are appended to it.
    if not directory.endswith('/'):
        directory = directory + '/'

    for category_index, category in enumerate(categories):
        category_dir = directory + category
        filelist = glob.glob(category_dir + '/*')
        # Progress output: category path immediately followed by its file
        # count. The parenthesised single-argument form prints the same text
        # on Python 2 and also parses on Python 3.
        print(category_dir + str(len(filelist)))
        for filename in filelist:
            # Keep the file open until the tokens have been extracted (the
            # parser may read lazily), then close it deterministically so a
            # large corpus does not exhaust file descriptors.
            with open(filename) as handle:
                tokens = set(BeautifulNoun(handle).alltokens())
            database.append([index, category_index, tokens])
            index += 1
    return database
Exemplo n.º 2
0
    def __init__(self, text):
        self.text = text

        # Noiseの削除
        # ストップワード
        # レマタイズ
        noun = BeautifulNoun(self.text, TYPE='txt', LANGUAGE='en', stem='y', lemm='n',
                             stopwords='y')
        allterms = []
        
        for sentence_tokens in noun.tokens:
            # print sentence_tokens
            allterms = allterms + noun.make_bigrams(sentence_tokens)\
                       + noun.make_trigrams(sentence_tokens)
        termfreqlist = []
        for term in allterms:
            m = 0
            for i in range(len(termfreqlist)):
                if term == termfreqlist[i][0]:
                    termfreqlist[i][1] += 1
                    m = 1
            if m == 0:
                termfreqlist.append([term, 1])

        # 頻度を利用して熟語を取り出す
        # sort(termfreqlist) by count and term-length.
        termfreqlist.sort(key=lambda x:len(x[0]), reverse=True)
        termfreqlist.sort(key=lambda x:x[1], reverse=True)
        print termfreqlist
        print

        """# Making Dterms.