Пример #1
0
    def updateDatabase(self):
        """Store the current project run in the database, replacing old rows.

        Does nothing except show a warning when the ``Database.Enable``
        setting in ``self.cfg`` is not true. Otherwise it connects, deletes
        any existing ``Project`` row that has this project's ID, and creates
        a fresh ``Project`` row plus one ``Input``, ``Analyzer``,
        ``Translator`` and ``Output`` row built from ``self.cfg`` and the
        files referenced in ``self.frameworkParams``.

        The connection is always closed again, even when building or
        writing the rows raises.
        """
        if not Text.isTrue(self.cfg["Database"]["Enable"]):
            Msg.showWarning("Database is NOT enabled in {0}".format(
                self.cfgPath))
            return

        Database.connect()
        try:
            Database.setDebug(
                Text.toTrueOrFalse(self.cfg["Database"]["Debug"]))
            with Database.ORM.db_session:
                # Drop the previous row for this project first so that the
                # insert below cannot collide on the same ID.
                existing = Database.Table.Project.get(ID=self.getProjectID())
                if existing is not None:
                    existing.delete()
                    Database.commit()

                workflow = self.cfg["Workflow"]
                params = self.frameworkParams

                projectTable = Database.Table.Project(
                    ID=self.getProjectID(),
                    Title=Database.sanitize(self.cfg["Title"]),
                    Description=Database.sanitize(self.cfg["Description"]),
                    DateTime=params["dateTime"],
                    Workflow=workflow)

                # The instances below are never read again: constructing
                # them inside the session is presumably what queues the
                # inserts (Pony-ORM-style entities) -- confirm against the
                # Database module.
                Database.Table.Input(
                    ProjectID=projectTable,
                    Content=Database.sanitize(
                        File.getContent(params["inputPath"])),
                    Source=Database.sanitize(workflow["Input"]["Source"]),
                    PluginName=Database.sanitize(
                        workflow["Input"]["Plugin"]),
                    PluginMethod=Database.sanitize(
                        workflow["Input"]["Method"]),
                    Plugin=workflow["Input"])

                Database.Table.Analyzer(
                    ProjectID=projectTable,
                    Content=Database.sanitize(
                        File.getContent(params["analyzerPath"])),
                    PluginName=Database.sanitize(
                        workflow["Analyzer"]["Plugin"]),
                    PluginMethod=Database.sanitize(
                        workflow["Analyzer"]["Method"]),
                    Plugin=workflow["Analyzer"])

                # The sanitized translator text is stored both raw and in
                # parsed form, so read and sanitize it only once.
                content = Database.sanitize(
                    File.getContent(params["translatorPath"]))
                Database.Table.Translator(
                    ProjectID=projectTable,
                    Content=content,
                    ContentParsed=Result.parseTranslatorContent(content),
                    PluginName=Database.sanitize(
                        workflow["Translator"]["Plugin"]),
                    PluginMethod=Database.sanitize(
                        workflow["Translator"]["Method"]),
                    Plugin=workflow["Translator"])

                Database.Table.Output(
                    ProjectID=projectTable,
                    Content=Database.sanitize(
                        File.getContent(params["outputPath"])),
                    Target=Database.sanitize(workflow["Output"]["Target"]),
                    PluginName=Database.sanitize(
                        workflow["Output"]["Plugin"]),
                    PluginMethod=Database.sanitize(
                        workflow["Output"]["Method"]),
                    Plugin=workflow["Output"])
        finally:
            # Release the connection on failure as well as on success.
            Database.disconnect()
Пример #2
0
class Article:
    """Hold the text of one article and the operations performed on it.

    Attributes are set in ``__init__``: ``title`` and ``body`` are the
    article texts, ``keyword`` is the list of keywords that get turned into
    links, ``table_name`` is the table used by the save methods, and
    ``conn`` is the ``Database`` object created from ``file_name``.
    """

    def __init__(self, title, body, file_name='', table_name=''):
        """Store the article texts and create the database object.

        Args:
            title: Title of the article.
            body: Body of the article (treated as HTML by the other methods).
            file_name: Passed as the only argument to ``Database``.
            table_name: Table that ``save`` inserts into (offline support).
        """
        self.title = title
        self.body = body
        self.keyword = []  # keywords that will be used as links
        self.table_name = table_name  # table name for offline support

        self.conn = Database(file_name)

    def extract_keyword(self):
        """Extract keywords from the body with Rake into ``self.keyword``."""
        # Beautiful Soup reduces the article to its plain text first.
        clean = self.get_clean_body()

        # Rake then extracts the ranked phrases from that text.
        r = Rake(min_length=1, max_length=1)
        r.extract_keywords_from_text(clean)
        self.keyword = r.get_ranked_phrases()

        # Debug output on the terminal: how many keywords were found.
        print('Keyword extracted with ' + str(len(self.keyword)) + ' words')

    def get_clean_body(self):
        """Return the text of ``self.body`` as parsed by Beautiful Soup."""
        return BeautifulSoup(self.body, 'lxml').text

    def get_wiki(self):
        """Keep only keywords that are Wikipedia titles and link them.

        Reads ``titles.txt`` from the current working directory (one title
        per line; the original note says the file can be obtained from
        Kaggle), drops every keyword that is not among the lower-cased
        titles, and wraps each space-delimited occurrence of the remaining
        keywords in ``self.body`` in a link to the English Wikipedia.
        """
        # A set gives constant-time membership tests, and the ``with`` block
        # closes the file even if reading fails. Only the line terminator is
        # stripped so a final line without a newline keeps its last letter.
        with open('titles.txt') as titles_file:
            titles = {line.rstrip('\n').lower() for line in titles_file}

        # Build the filtered list first and assign it back through a slice:
        # removing items while iterating the same list skips elements, and
        # the slice assignment keeps the identity of the list that
        # ``get_keyword`` may already have handed out.
        self.keyword[:] = [word for word in self.keyword if word in titles]

        for k in self.keyword:
            self.body = self.body.replace(
                ' ' + k + ' ', ' <a href="https://en.wikipedia.org/wiki/' + k +
                '">' + k + '</a> ')

    def get_keyword(self):
        """Return the current keyword list."""
        return self.keyword

    def get_title(self):
        """Return the article title."""
        return self.title

    def get_content(self):
        """Return the article body in its current state."""
        return self.body

    def clean_html(self):
        """Run the body through ``TagDropper`` and store the resulting text."""
        # Tags from the web page that only get in the way.
        td = TagDropper(['img', 'h4', 'svg', 'a', 'figure', 'div', 'path'])
        td.feed(self.body)

        self.body = td.get_text()

    # --- Database helpers ---

    def save_article(self):
        """Insert the title and body as one row, then commit."""
        self.save(['title', 'raw_article'], (self.title, self.body))
        self.commit()

    def save(self, column, data):
        """Insert ``data`` for ``column`` into ``self.table_name``.

        Args:
            column: Column names, forwarded to ``self.conn.insert``.
            data: Values for those columns, forwarded unchanged.
        """
        self.conn.insert(self.table_name, column, data)

    def commit(self):
        """Commit on the underlying database object."""
        self.conn.commit()