예제 #1
0
    def __init__(self, dir_text_file, dir_output_file,
                 windowsize, docsuffix, wordmap_file):
        """Store the configuration and trigger the initial loading steps.

        The five arguments are kept verbatim on the instance under the
        same names.  After the text-processing helpers are created, the
        constructor calls ``self.getdoclist()`` and then
        ``self.readwordmap()``.
        """
        # Fixed extensions for the generated files.
        self.mapfile_suffix = ".idmap"
        self.graphfile_suffix = ".graph"

        # Caller-supplied configuration.
        self.wordmap_file = wordmap_file
        self.docsuffix = docsuffix
        self.windowsize = windowsize
        self.dir_output_file = dir_output_file
        self.dir_text_file = dir_text_file

        # Text-processing helpers (created in the same order as before).
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)

        # Containers filled by the two loader calls below.
        self.doclist = []
        self.corp_wordmap = {}
        self.getdoclist()
        self.readwordmap()
예제 #2
0
class GraphGenerator:
    '''Build word co-occurrence graphs for the basic random walk method.

    Method: graph construction based on a sliding-window strategy.
    Output: two kinds of files per document
            1. graph files (one dense and one sparse representation);
            2. a map file relating in-document word ids to the ids
               found in the corpus word map.
    '''

    def __init__(self, dir_text_file, dir_output_file,
                 windowsize, docsuffix, wordmap_file):
        '''Store the configuration, then load the document list and word map.

        dir_text_file   -- iterable of directories to scan for documents
        dir_output_file -- string prepended to every output file name
        windowsize      -- number of tokens taken on each side of a word
        docsuffix       -- substring a file name must contain to be used;
                           a falsy value disables the filter
        wordmap_file    -- path of the "<word> <id>" corpus word map
        '''
        self.mapfile_suffix = ".idmap"
        self.graphfile_suffix = ".graph"
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)

        self.dir_text_file = dir_text_file
        self.dir_output_file = dir_output_file
        self.windowsize = windowsize
        self.docsuffix = docsuffix
        self.wordmap_file = wordmap_file

        self.doclist = []
        self.getdoclist()
        self.corp_wordmap = {}
        self.readwordmap()

    def getdoclist(self):
        '''Append the path of every matching document to self.doclist.'''
        for rootdir in self.dir_text_file:
            candi_files = os.listdir(rootdir)
            # filter file by suffix if existed
            if self.docsuffix:
                candi_files = [name for name in candi_files
                               if self.substr(name)]
            # Real lists are used (not filter/map objects) so this also
            # works on Python 3; os.path.join supplies the separator when
            # rootdir has no trailing slash.
            self.doclist.extend(os.path.join(rootdir, name)
                                for name in candi_files)

    def substr(self, candi_file):
        '''Return True when self.docsuffix occurs anywhere in candi_file.'''
        return self.docsuffix in candi_file

    def readwordmap(self):
        '''Load "<word> <id>" lines from self.wordmap_file into corp_wordmap.

        Blank lines are ignored.  A non-blank line with fewer than two
        fields raises IndexError; a non-integer id raises ValueError.
        '''
        with open(self.wordmap_file) as rfd:
            for line in rfd:
                biparts = line.split()
                if not biparts:
                    continue
                self.corp_wordmap[biparts[0]] = int(biparts[1])

    # filter words based on stopwords list and character rule
    def filterwords(self, textline):
        '''Split a line of "word_TAG" tokens and normalise the words.

        Returns (stemmed_words, saved_words): the first list holds every
        lower-cased, stemmed word in order; the second only those whose
        tag is in POS, that are not stopwords and that match self.pattern.
        A non-empty token without a "_TAG" part raises IndexError.
        '''
        stemmed_words = []
        saved_words = []
        for word in textline.split(" "):
            # Blank lines and repeated spaces produce empty tokens; they
            # carry no word and no tag, so skip them.
            if not word.strip():
                continue
            biparts = word.split("_")
            # words processing (stopword, stemming, lower)
            stem = biparts[0].lower()
            stem = self.stemmer.stem(stem, 0, len(stem) - 1)
            stemmed_words.append(stem)
            if (biparts[1] in POS
                    and not self.stopWords.is_stopword(stem)
                    and self.pattern.match(stem)):
                saved_words.append(stem)
        return stemmed_words, saved_words

    # graph construction
    # strategy: 1.filter words according to POS tags;
    #           2.construct graph based on sliding window.
    def construct(self):
        '''Write the graph files and the id map file for every document.'''
        for doc in self.doclist:
            doc_prefix = os.path.basename(doc).split('.')[0]
            output_graphfile = (self.dir_output_file + doc_prefix
                                + self.graphfile_suffix)
            output_mapfile = (self.dir_output_file + doc_prefix
                              + self.mapfile_suffix)

            cleaned_wordslist = []
            stemmed_wordslist = []
            with open(doc) as rfd:
                for line in rfd:
                    stemmed_words, cleaned_words = self.filterwords(
                        line.strip('\n\r '))
                    # extend() avoids re-copying the whole list per line.
                    cleaned_wordslist.extend(cleaned_words)
                    stemmed_wordslist.extend(stemmed_words)
            wordsmap_indoc = self.numword_indoc(cleaned_wordslist)
            pairids = self.mapwordspair(wordsmap_indoc)
            pairids = sorted(pairids, key=lambda x: x[0])
            dense_graph = self.slidingwindow(stemmed_wordslist, wordsmap_indoc)
            self.output_graph('dense', dense_graph, output_graphfile)
            self.output_graph('sparse', dense_graph, output_graphfile)
            self.output_map(pairids, output_mapfile)

    def mapwordspair(self, ids_indoc):
        '''Return [in-document id, corpus id] pairs for every word.

        Raises KeyError when a word is missing from self.corp_wordmap.
        '''
        return [[doc_id, self.corp_wordmap[word]]
                for word, doc_id in ids_indoc.items()]

    def slidingwindow(self, stemmed_wordslist, wordsmap_indoc):
        '''Count co-occurrences of mapped words inside the sliding window.

        Entry [a-1, b-1] of the returned square float matrix is increased
        each time the word with id b appears within self.windowsize
        tokens of an occurrence of the word with id a.  Occurrences of
        the same word are not counted against each other.
        '''
        size = len(wordsmap_indoc)
        dense_graph = np.zeros((size, size))
        total = len(stemmed_wordslist)

        for i, word in enumerate(stemmed_wordslist):
            if word not in wordsmap_indoc:
                continue
            row = wordsmap_indoc[word] - 1
            start = max(0, i - self.windowsize)
            stop = min(total, i + self.windowsize + 1)
            for neighbour in stemmed_wordslist[start:stop]:
                if neighbour == word:
                    continue
                if neighbour in wordsmap_indoc:
                    dense_graph[row, wordsmap_indoc[neighbour] - 1] += 1
        return dense_graph

    def numword_indoc(self, wordslist_indoc):
        '''Number the distinct words from 1 in order of first appearance.'''
        wordsmap_indoc = {}
        for word in wordslist_indoc:
            if word not in wordsmap_indoc:
                wordsmap_indoc[word] = len(wordsmap_indoc) + 1
        return wordsmap_indoc

    def output_graph(self, choice, graphdata, graphfile):
        '''Write graphdata to graphfile + ".dense" or graphfile + ".sparse".

        "dense" writes one space-separated matrix row per line; "sparse"
        writes one "row col value" line per non-zero entry using 0-based
        indices.  Any other choice raises ValueError before a file is
        opened.
        '''
        if choice == "dense":
            with open(graphfile + '.dense', 'w') as wfd:
                for row in graphdata:
                    wfd.write("%s\n" % ' '.join(map(str, row)))
        elif choice == "sparse":
            with open(graphfile + '.sparse', 'w') as wfd:
                # argwhere yields the non-zero positions in row-major
                # order, matching a nested row/column scan.
                for i, j in np.argwhere(graphdata):
                    wfd.write("%d %d %d\n" % (i, j, graphdata[i, j]))
        else:
            raise ValueError("unknown graph representation: %r" % (choice,))

    def output_map(self, mapdata, mapfile):
        '''Write the second element of every pair in mapdata, one per line.'''
        with open(mapfile, 'w') as wfd:
            for pair in mapdata:
                wfd.write("%d\n" % pair[1])