예제 #1
0
    def createTFIDFTopics(self):
        """Group headline titles by day and country and run TF-IDF per group.

        Reads every row of ``articles_headlines`` from the PostgreSQL
        database identified by ``self.dbname``, ``self.dbuser``,
        ``self.dbpass`` and ``self.dbhost``, and collects the titles under a
        ``"<article_day>-<country>"`` key. For each key it prints a marker
        line and feeds the collected titles to a fresh ``TfIdf`` configured
        with the stop words listed in ``stopwords.txt`` (one per line,
        resolved relative to the current working directory).

        The connection is stored on ``self.db`` and is closed before the
        TF-IDF pass starts, even when the query raises.
        """
        # Keyword arguments let psycopg2 build and quote the connection
        # string, so credentials containing spaces or quotes stay intact.
        self.db = psycopg2.connect(
            dbname=self.dbname,
            user=self.dbuser,
            password=self.dbpass,
            host=self.dbhost,
        )

        headlines = {}
        try:
            c = self.db.cursor()
            c.execute(
                "SELECT article_day,country,title,url,article_hash FROM articles_headlines")
            for row in c.fetchall():
                # NOTE: only the title is analysed; article content is not
                # fetched or included.
                key = str(row[0]) + '-' + row[1]
                headlines.setdefault(key, []).append(row[2])
        finally:
            # Release the connection even if the query or fetch failed.
            self.db.close()

        # Read the stop-word list once instead of once per group. It is only
        # opened when there is something to process, so an empty table still
        # does not require the file to exist.
        stopwords = []
        if headlines:
            with open('stopwords.txt', 'r') as st:
                stopwords = [line.strip() for line in st]

        for hd, contents in headlines.items():
            print(f'>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {hd}')
            # Hand each TfIdf its own copy so instances never share a list.
            tfidf = TfIdf(stopwords=list(stopwords))
            tfidf.parse(contents)
예제 #2
0
    def createTFIDFTopics(self):
        """Group headline titles by day and country and run TF-IDF per group.

        Reads every row of ``articles_headlines`` from the SQLite database at
        ``self.dbname`` and collects the titles under a
        ``"<article_day>-<country>"`` key. For each key it prints a marker
        line and feeds the collected titles to a fresh ``TfIdf`` configured
        with the stop words listed in ``stopwords.txt`` (one per line,
        resolved relative to the current working directory).

        The connection is stored on ``self.db`` and is closed before the
        TF-IDF pass starts, even when the query raises.
        """
        self.db = sqlite3.connect(self.dbname,
                                  detect_types=sqlite3.PARSE_DECLTYPES)

        headlines = {}
        try:
            c = self.db.cursor()
            c.execute(
                "SELECT article_day,country,title,url,article_hash FROM articles_headlines"
            )
            for row in c.fetchall():
                # NOTE: only the title is analysed; article content is not
                # fetched or included.
                key = str(row[0]) + '-' + row[1]
                headlines.setdefault(key, []).append(row[2])
        finally:
            # Release the connection even if the query or fetch failed.
            self.db.close()

        # Read the stop-word list once instead of once per group. It is only
        # opened when there is something to process, so an empty table still
        # does not require the file to exist.
        stopwords = []
        if headlines:
            with open('stopwords.txt', 'r') as st:
                stopwords = [line.strip() for line in st]

        # items() and the parenthesised single-argument print behave the same
        # on Python 2 and Python 3, unlike iteritems() and the print statement.
        for hd, contents in headlines.items():
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ' + hd)
            # Hand each TfIdf its own copy so instances never share a list.
            tfidf = TfIdf(stopwords=list(stopwords))
            tfidf.parse(contents)