Exemplo n.º 1
0
def UpdateSimilarityDatabase(target=None, title=None, outPut=None):
    """Queue and execute the SQL that stores embedding similarity scores.

    Two modes, selected by ``target``:

    * ``target is None``: every word of ``embedding_model`` for which
      ``NLP.DoNLP(word, ['VA', 'VV'])`` returns something is compared with
      every other such word.  Pairs already present in
      ``similar_word_relation`` get an UPDATE, new pairs get an INSERT.  The
      statements run against ``db_capstone``.
    * ``target`` given: it is used as a ``Product_ID``.  For the product's
      relation table name and each feature of its category, UPDATE statements
      write the similarity score of every model word into the matching
      column of the product's relation table, plus a sentiment value when
      ``BindSentiWords`` provides one.  The statements run against
      ``db_capstone_similarity``.

    Args:
        target: ``Product_ID`` to process, or ``None`` for the whole
            vocabulary.
        title: Optional title prefix for ``main.ShowTitle``.
        outPut: Optional label; shown as ``'Process data of <outPut>'``.

    Raises:
        IndexError: If ``target`` is given but ``product_dic`` has no such
            product (the first result row is read unconditionally).
    """
    global embedding_model

    if title is None:
        title = 'Append Similar word relation'
    else:
        title += '\nAppend Similar word relation'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    main.ShowTitle(title, outPut + 'Getting exist data')

    insertQuery = []
    updateQuery = []
    timeFormat = '%Y%m%d%H%M%S'

    if target is None:
        # Map: normal word -> {target word: Similar_Relation_ID}.
        relationDict = {}
        relationList = DataBaseManager.DoSQL("""
        SELECT  Normal_Word, Target_Word, Similar_Relation_ID
        FROM    similar_word_relation
        """)
        for relation in relationList:
            # setdefault keeps this independent of row order and of whether
            # a self-referencing row exists for the word at all; the previous
            # `.get(word).update(...)` raised AttributeError otherwise.
            knownTargets = relationDict.setdefault(relation[0], {})
            if relation[0] != relation[1]:
                knownTargets[relation[1]] = relation[2]

        main.ShowTitle(title,
                       outPut + 'Getting latest calculated similar data')
        wordList = list(embedding_model.wv.index2word)

        # Keep only words for which the tagger returns a hit for these tags
        # (verbs and adjectives, going by the progress message).
        targetTag = ['VA', 'VV']
        keptWords = []
        removedCount = 0
        updateTime = 0
        for index, word in enumerate(wordList):
            # The second-resolution timestamp throttles the progress display
            # to at most one refresh per second.
            currentTime = int(datetime.datetime.now().strftime(timeFormat))
            if updateTime < currentTime:
                updateTime = currentTime
                main.ShowTitle(
                    title, outPut + 'Removing not verb and adjective (' +
                    str(index) + '/' + str(len(wordList)) + ' removed: ' +
                    str(removedCount) + ')')
            if len(NLP.DoNLP(word, targetTag)) <= 0:
                removedCount += 1
            else:
                keptWords.append(word)
        wordList = keptWords
        wordDict = dict.fromkeys(wordList)

        # Ask for every other vocabulary entry, not just the nearest few.
        topn = len(embedding_model.wv.index2word) - 1
        updateTime = 0
        for index, word in enumerate(wordDict):
            result = embedding_model.most_similar(positive=[word], topn=topn)
            currentTime = int(datetime.datetime.now().strftime(timeFormat))
            if updateTime < currentTime:
                updateTime = currentTime
                main.ShowTitle(
                    title, outPut + 'Append query (' + str(index) + '/' +
                    str(len(wordList)) + ')')
            existData = relationDict.get(word, {})
            for similar in result:
                if similar[0] not in wordDict:
                    # Neighbour was filtered out above.
                    continue
                if similar[0] in existData:
                    relationID = existData[similar[0]]
                    newQuery = """
                        UPDATE  similar_word_relation
                        SET     Similar_Value = """ + str(similar[1]) + """
                        WHERE   Similar_Relation_ID = """ + str(relationID)
                    updateQuery.append(newQuery)
                else:
                    # NOTE(review): words are concatenated into the SQL text
                    # unescaped; a word containing a quote breaks the query.
                    newQuery = """
                        INSERT INTO similar_word_relation (Normal_Word, Target_Word, Similar_Value)
                        VALUES      ('""" + word + """', '""" + similar[
                        0] + """', """ + str(similar[1]) + """)"""
                    insertQuery.append(newQuery)
    else:
        sqlResult = DataBaseManager.DoSQL("""
        SELECT  Category_ID, Relation_Table_Name
        FROM    product_dic
        WHERE   Product_ID = """ + str(target) + """
        """)
        productInfo = sqlResult[0]

        # The relation table name doubles as the first "feature" column.
        featureList = [productInfo[1]]
        sqlResult = DataBaseManager.DoSQL("""
        SELECT  Feature_Name
        FROM    feature_dic
        WHERE   Category_ID = """ + str(productInfo[0]) + """
        """)
        featureList.extend(result[0] for result in sqlResult)

        SentiWordBinder = BindSentiWords.BindSentiWords()
        topn = len(embedding_model.wv.index2word) - 1
        updateTime = 0
        for index, feature in enumerate(featureList):
            result = embedding_model.most_similar(positive=[feature],
                                                  topn=topn)
            currentTime = int(datetime.datetime.now().strftime(timeFormat))
            if updateTime < currentTime:
                updateTime = currentTime
                main.ShowTitle(
                    title, outPut + 'Append query (' + str(index) + '/' +
                    str(len(featureList)) + ')')
            for similar in result:
                if feature != similar[0] and productInfo[1] != similar[0]:
                    updateQuery.append("""
                    UPDATE  `""" + productInfo[1] + """`
                    SET     `""" + feature + """` = """ + str(similar[1]) + """
                    WHERE   Word = '""" + similar[0] + """'
                    """)

                    sentiValueDict = SentiWordBinder.BindSentiWords(
                        [similar[0]])
                    # The binder reports "no value" as the string 'None'.
                    if sentiValueDict[similar[0]] != 'None':
                        updateQuery.append("""
                        UPDATE  `""" + productInfo[1] + """`
                        SET     Sentiment_Value = """ +
                                           sentiValueDict[similar[0]] + """
                        WHERE   Word = '""" + similar[0] + """'
                        """)

            # Queued after the per-word updates: blank the score again for
            # words whose Word_Count is 5 or less.
            updateQuery.append("""
            UPDATE  `""" + productInfo[1] + """`
            SET     `""" + feature + """` = null
            WHERE   Word_Count <= """ + str(5) + """
            """)

    if target is None:
        db = 'db_capstone'
    else:
        db = 'db_capstone_similarity'

    DataBaseManager.DoManyQuery(insertQuery,
                                db=db,
                                title=title,
                                outPut=outPut,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery,
                                db=db,
                                title=title,
                                outPut=outPut,
                                queryType='UPDATE')
Exemplo n.º 2
0
def UpdateSimilarWordDictionary(title=None, outPut=None):
    """Derive sub-word -> super-word pairs and store them in similar_word_dic.

    Steps:

    1. Load every ``similar_word_relation`` row with ``Similar_Value > 0.95``
       and group it as ``{Normal_Word: {Target_Word: Word_Count}}``.
    2. Drop words that have at most one relation.
    3. For each remaining word, compare its sentiment value (from
       ``BindSentiWords``) with that of each related word.  When both are
       known and equal as integers, the word with the smaller count becomes
       the sub-word of the one with the larger count; equal counts produce
       no pair.
    4. Collapse chains so a sub-word points at the super-word of its
       super-word.
    5. INSERT pairs whose sub-word is not yet in ``similar_word_dic`` and
       UPDATE the ``Super_Word`` of those that are.

    Args:
        title: Optional title prefix for ``main.ShowTitle``.
        outPut: Optional label; shown as ``'Process data of <outPut>'``.
    """
    if title is None:
        title = 'Update Similar word Dictionary'
    else:
        title += '\nUpdate Similar word Dictionary'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    timeFormat = '%Y%m%d%H%M%S'

    main.ShowTitle(title, outPut + 'Getting similarity data')
    sqlResult = DataBaseManager.DoSQL("""
    SELECT  Normal_Word, Target_Word, Word_Count
    FROM    similar_word_relation
    WHERE   Similar_Value > 0.95
    ORDER BY Similar_Value DESC
    """)

    # Step 1: {normal word: {target word: word count}}.
    wordDict = {}
    updateTime = 0
    for index, result in enumerate(sqlResult):
        # Second-resolution timestamp: refresh the display at most once a
        # second.
        currentTime = int(datetime.datetime.now().strftime(timeFormat))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title, outPut + 'Building relation dictionary (' + str(index) +
                '/' + str(len(sqlResult)) + ')')
        wordDict.setdefault(result[0], {})[result[1]] = result[2]

    # Step 2: collect first, then remove, so the dict is not resized while
    # it is being iterated.
    initialLength = len(wordDict)
    removeList = []
    updateTime = 0
    for index, (key, relation) in enumerate(wordDict.items()):
        currentTime = int(datetime.datetime.now().strftime(timeFormat))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Removing unnecessary word (' + str(index) + '/' +
                str(initialLength) + ' removed: ' + str(len(removeList)) + ')')
        if len(relation) <= 1:
            removeList.append(key)
    for key in removeList:
        wordDict.pop(key)

    # Step 3.
    relatedWordDict = {}
    SentiWordBinder = BindSentiWords.BindSentiWords()
    updateTime = 0
    for index, (key, value) in enumerate(wordDict.items()):
        currentTime = int(datetime.datetime.now().strftime(timeFormat))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title, outPut + 'Calculating sentimental value (' +
                str(index) + '/' + str(len(wordDict)) + ')')
        sentiValueDict = SentiWordBinder.BindSentiWords(list(value.keys()))

        # NOTE(review): the direct lookups below raise KeyError when a word
        # has no row relating it to itself, or when the compared word was
        # pruned in step 2 -- confirm the table always holds such rows.
        keySentiValue = sentiValueDict[key]
        # The binder reports "no value" as the string 'None'.
        if keySentiValue == 'None':
            continue
        sentiValueDict.pop(key)
        for word, targetSentiValue in sentiValueDict.items():
            if targetSentiValue == 'None':
                continue
            if int(keySentiValue) != int(targetSentiValue):
                continue
            # Record a pair only when the counts differ.  Previously a tie
            # fell through and re-applied whatever `newRelation` was left
            # over, which could be a {word: count} entry from step 1.
            if value[key] > wordDict[word][word]:
                relatedWordDict[word] = key
            elif value[key] < wordDict[word][word]:
                relatedWordDict[key] = word

    # Step 4: only values of existing keys are reassigned, so iterating the
    # live dict is safe (its size never changes here).
    updateTime = 0
    for index, (subWord, superWord) in enumerate(relatedWordDict.items()):
        currentTime = int(datetime.datetime.now().strftime(timeFormat))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title, outPut + 'Building similar word dictionary (' +
                str(index) + '/' + str(len(relatedWordDict)) + ')')
        if superWord not in relatedWordDict:
            continue
        upperWord = relatedWordDict[superWord]
        relatedWordDict[subWord] = upperWord
        for otherSub, otherSuper in relatedWordDict.items():
            if otherSuper == superWord:
                relatedWordDict[otherSub] = upperWord

    # Step 5.
    main.ShowTitle(title, outPut + 'Getting exist similar word dictionary')
    sqlResult = DataBaseManager.DoSQL("""
    SELECT  Sub_Word, Similar_ID
    FROM    similar_word_dic
    """)
    existRelatedWordDict = dict(sqlResult)
    insertQuery = []
    updateQuery = []

    updateTime = 0
    for index, (subWord, superWord) in enumerate(relatedWordDict.items()):
        currentTime = int(datetime.datetime.now().strftime(timeFormat))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title, outPut + 'Appending Query (' + str(index) + '/' +
                str(len(relatedWordDict)) + ')')
        # NOTE(review): words are concatenated into the SQL text unescaped.
        if subWord in existRelatedWordDict:
            dictionaryID = existRelatedWordDict[subWord]
            # The original statement read `SET Super_Word` with no value,
            # which is not valid SQL; assign the new super word.
            updateQuery.append("""
            UPDATE  similar_word_dic
            SET     Super_Word = '""" + superWord + """'
            WHERE   Similar_ID = """ + str(dictionaryID) + """
            """)
        else:
            insertQuery.append("""
            INSERT INTO similar_word_dic (Sub_Word, Super_Word)
            VALUES ('""" + subWord + """', '""" + superWord + """')
            """)

    DataBaseManager.DoManyQuery(insertQuery,
                                title=title,
                                outPut=outPut,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery,
                                title=title,
                                outPut=outPut,
                                queryType='UPDATE')
Exemplo n.º 3
0
def AppendWordDicQuery(title=None, outPut=None):
    """Prepare per-product word tables and queue their word-count queries.

    For every product in the module-level ``productSimilarDic``:

    * If ``productDic[name]['relationTable']`` is ``None``, a CREATE TABLE
      statement for ``db_capstone_similarity.<name>`` is queued (one FLOAT
      column for the product name and one per feature of its category), an
      UPDATE of ``product_dic.Relation_Table_Name`` is queued, and the
      in-memory ``productDic`` entry is updated.
    * Otherwise the table's existing ``Word`` -> ``Word_ID`` rows are read.

    Each word count is then appended to the module-level
    ``similarInsertQuery`` (new word) or ``similarUpdateQuery`` (existing
    word).  Only the CREATE and ``product_dic`` UPDATE statements are
    executed here; the two similar-query lists are left for the caller.

    Args:
        title: Optional title passed through to ``DoManyQuery``.
        outPut: Optional progress text passed through to ``DoManyQuery``.
    """
    global productDic
    global productSimilarDic
    global similarInsertQuery
    global similarUpdateQuery
    global insertQuery
    global updateQuery

    if title is None:
        title = ''
    if outPut is None:
        outPut = ''

    # {Category_ID: [Feature_Name, ...]}
    featureDic = {}
    sqlResult = DataBaseManager.DoSQL("""
    SELECT  Feature_Name, Category_ID
    FROM    feature_dic
    """)
    for result in sqlResult:
        featureDic.setdefault(result[1], []).append(result[0])

    createQuery = []
    updateRelateTableQuery = []

    for name, wordDict in productSimilarDic.items():
        if productDic[name]['relationTable'] is None:
            # A product without a category id, or whose category has no
            # features, gets only the column named after itself.
            try:
                categoryFeatures = featureDic[productDic[name]['categoryID']]
            except KeyError:
                categoryFeatures = []
            featureList = [name] + list(categoryFeatures)

            createQuery.append("""
            CREATE TABLE        `db_capstone_similarity`.`""" + name + """` (
                `Word_ID`       INT(11) NOT NULL AUTO_INCREMENT,
                `Word`          TEXT    NOT NULL,
                `Word_Count`    INT(11) NOT NULL DEFAULT '0',
                `Sentiment_Value` INT(11) NULL DEFAULT NULL,`""" +
                               '`FLOAT NULL DEFAULT NULL,`'.join(featureList) +
                               """`          FLOAT NULL DEFAULT NULL,
                PRIMARY KEY (`Word_ID`)
            ) ENGINE = InnoDB
            """)

            updateRelateTableQuery.append("""
            UPDATE  product_dic
            SET     Relation_Table_Name = '""" + name + """'
            WHERE   Product_ID = """ + str(productDic[name]['productID']) + """
            """)

            productDic[name]['relationTable'] = name
            # A table that is only now being created holds no words yet.
            existWord = {}
        else:
            existWord = dict(
                DataBaseManager.DoSQL(
                    """
            SELECT  `Word`, `Word_ID`
            FROM    `""" + name + """`""", 'db_capstone_similarity'))

        # NOTE(review): names and words are concatenated into the SQL text
        # unescaped; a value containing a quote or backtick breaks the query.
        for word, count in wordDict.items():
            if word in existWord:
                wordID = existWord[word]
                similarUpdateQuery.append("""
                UPDATE  `""" + name + """`
                SET     `Word_Count` = `Word_Count` + """ + str(count) + """
                WHERE   `Word_ID` = """ + str(wordID) + """
                """)
            else:
                similarInsertQuery.append("""
                INSERT INTO `""" + name + """` (`Word` ,`Word_Count`)
                VALUES ('""" + word + """', """ + str(count) + """)
                """)

    DataBaseManager.DoManyQuery(createQuery,
                                title=title,
                                outPut=outPut,
                                queryType='CREATE')
    DataBaseManager.DoManyQuery(updateRelateTableQuery,
                                title=title,
                                outPut=outPut,
                                queryType='UPDATE')
Exemplo n.º 4
0
def Dividing(reviewData, fileName, title=None, outPut=None):
    """Turn raw review lines into dictionary/word-count SQL and execute it.

    Each entry of ``reviewData`` is split on commas into a review number, a
    title and a body.  Reviews whose number (``'<fileName>-<number>'``) is
    already in ``review_dic`` are skipped, as are reviews whose title is the
    marker ``'!e'``.  For the rest, ``GetProductName`` decides which products
    the review mentions; for each product a ``product_dic`` count UPDATE and
    a ``review_dic`` INSERT are queued and the per-product word counts in
    the module-level ``productSimilarDic`` are increased.  Finally
    ``AppendWordDicQuery`` is called and all four module-level query lists
    are executed.

    Args:
        reviewData: Iterable of comma-separated review lines; must support
            ``len()``.
        fileName: Source file name, used in review numbers and messages.
        title: Optional title for progress display.
        outPut: Optional text placed before progress messages.

    Returns:
        A summary string ending in a newline, or ``'No data in <fileName>'``
        as soon as a line with fewer than two comma-separated fields is met.
    """
    global similarInsertQuery
    global similarUpdateQuery
    global insertQuery
    global updateQuery

    if title is None:
        title = ''
    if outPut is None:
        outPut = ''

    similarInsertQuery = []
    similarUpdateQuery = []
    insertQuery = []
    updateQuery = []

    completeIndex = 0
    noProductIndex = 0
    skippedIndex = 0

    def countSummary():
        # ' (skipped: N / not product: M)' with only the non-zero parts, or
        # '' when both counters are zero.
        parts = []
        if skippedIndex > 0:
            parts.append('skipped: ' + str(skippedIndex))
        if noProductIndex > 0:
            parts.append('not product: ' + str(noProductIndex))
        if not parts:
            return ''
        return ' (' + ' / '.join(parts) + ')'

    sqlResult = DataBaseManager.DoSQL("""
    SELECT  Review_ID, Review_Number
    FROM    review_dic
    """)
    completedReview = dict(sqlResult)
    # Built once so the per-review membership test is O(1) instead of a scan
    # over every stored review.
    completedNumbers = set(completedReview.values())

    updateTime = 0
    for data in reviewData:
        # Second-resolution timestamp: refresh the display at most once a
        # second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title, outPut + 'Building dictionary for ' + fileName + ' (' +
                str(completeIndex + skippedIndex + noProductIndex) + '/' +
                str(len(reviewData)) + ')' + countSummary())

        splitData = data.split(',')
        if len(splitData) < 2:
            # NOTE(review): this aborts the whole file and returns before any
            # query queued so far is executed -- confirm that is intended.
            return 'No data in ' + fileName

        reviewNumber = fileName + '-' + splitData[0]
        reviewTitleString = splitData[1].replace('\n', '').replace(';', ',')
        # Commas inside the body were split away above and are not restored.
        reviewString = ''.join(splitData[2:])
        reviewString = reviewString.replace('\n', '').replace(';', ',')

        if reviewNumber in completedNumbers:
            skippedIndex += 1
            continue

        if reviewTitleString == '!e':
            continue

        reviewTitleStringList = NLP.DoNLP(reviewTitleString, None, 'Review')
        reviewStringList = NLP.DoNLP(reviewString, None, 'Review')

        resultList = GetProductName(
            ' '.join(reviewTitleStringList),
            ' '.join(reviewStringList)).get('product_Name')

        if len(resultList) > 0:
            resultStringList = DictionaryBuilder.ConvertNormalWord(
                mainStringList=reviewTitleStringList,
                subStringList=reviewStringList,
                mode='Review')
            resultString = '#'.join(resultStringList)

            # NOTE(review): names and review text are concatenated into the
            # SQL unescaped; a quote in either breaks the statement.
            for name in resultList:
                updateQuery.append("""
                UPDATE  product_dic
                SET     Count = Count + 1
                WHERE   Product_Name = '""" + name + """'
                """)
                insertQuery.append("""
                INSERT INTO review_dic (Review_Number, Review, Product_ID)
                VALUES ('""" + reviewNumber + """', '""" + resultString +
                                   """', """ +
                                   str(productDic[name].get('productID')) +
                                   """)
                """)

                for word in resultStringList:
                    # Created lazily so a product is only registered once it
                    # actually has a word to count.
                    wordDict = productSimilarDic.setdefault(name, {})
                    wordDict[word] = wordDict.get(word, 0) + 1

            completeIndex += 1
        else:
            noProductIndex += 1

    returnString = ("Complete building dictionary for " + fileName +
                    countSummary() + '\n')

    AppendWordDicQuery(title=title, outPut=outPut + returnString)
    DataBaseManager.DoManyQuery(insertQuery,
                                title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery,
                                title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')
    DataBaseManager.DoManyQuery(similarInsertQuery,
                                'db_capstone_similarity',
                                title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(similarUpdateQuery,
                                'db_capstone_similarity',
                                title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')

    return returnString
Exemplo n.º 5
0
def AppendArticleDic(reviewData, fileName, title=None, outPut=None):
    """Queue and execute INSERTs that add new articles to ``article_dic``.

    Each entry of ``reviewData`` is split on commas into an article number
    and the remaining text.  Articles whose number
    (``'<fileName>-<number>'``) is already stored are skipped; for the
    others the text is normalised with ``ConvertNormalWord``, joined with
    ``'#'`` and queued for insertion, and the words are counted.

    Args:
        reviewData: Iterable of comma-separated article lines; must support
            ``len()``.
        fileName: Source file name, used in article numbers and messages.
        title: Optional title for progress display.
        outPut: Optional text placed before progress messages.

    Returns:
        A summary string ending in a newline, or ``'No data in <fileName>'``
        as soon as a line with fewer than two comma-separated fields is met.
    """
    global insertQuery
    global updateQuery

    if title is None:
        title = ''
    if outPut is None:
        outPut = ''

    insertQuery = []
    updateQuery = []
    stackUnit = DataBaseManager.maximumQueryStactUnit

    completeIndex = 0
    skippedIndex = 0

    articleLastID = DataBaseManager.DoSQL("""
    SELECT `AUTO_INCREMENT`
    FROM  INFORMATION_SCHEMA.TABLES
    WHERE TABLE_SCHEMA = 'db_capstone'
    AND   TABLE_NAME   = 'article_dic';
    """)[0][0]

    # Read the stored article numbers in id windows of `stackUnit` rows
    # rather than with one unbounded SELECT.
    articleNumberList = []
    index = 0
    while True:
        articleNumberList.extend(
            DataBaseManager.DoSQL("""
        SELECT  Article_ID, Article_Number
        FROM    article_dic
        WHERE   Article_ID > """ + str(index) + """ AND Article_ID <= """ +
                                  str(index + stackUnit)))
        index += stackUnit
        if index > articleLastID:
            break
    completedReview = dict(articleNumberList)
    # Built once so the per-article membership test is O(1).
    completedNumbers = set(completedReview.values())
    wordDic = {}

    main.ShowTitle(
        title, outPut + 'Building dictionary for ' + fileName + ' (' +
        str(completeIndex) + '/' + str(len(reviewData)) + ')')
    updateTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
    for data in reviewData:
        # Second-resolution timestamp: refresh the display at most once a
        # second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title, outPut + 'Building dictionary for ' + fileName + ' (' +
                str(completeIndex + skippedIndex) + '/' +
                str(len(reviewData)) + ')')

        splitData = data.split(',')
        if len(splitData) < 2:
            # NOTE(review): this aborts the whole file and returns before any
            # query queued so far is executed -- confirm that is intended.
            return 'No data in ' + fileName

        reviewNumber = fileName + '-' + splitData[0]
        titleField = splitData[1]
        reviewString = titleField + ',' + ''.join(splitData[2:])
        reviewString = reviewString.replace('\n', '').replace(';', ',')

        if reviewNumber in completedNumbers:
            skippedIndex += 1
            continue

        # The old check compared the combined text with '!e', but that text
        # always contains the ',' added above and so could never match.
        # Compare the title field instead, as Dividing() does for reviews.
        if titleField.replace('\n', '').replace(';', ',') == '!e':
            continue
        resultStringList = ConvertNormalWord(reviewString, reviewString)

        resultString = '#'.join(resultStringList)
        # NOTE(review): the text is concatenated into the SQL unescaped; a
        # quote in it breaks the statement.
        insertQuery.append("""
        INSERT INTO article_dic (Article_Number, Article)
        VALUES ('""" + reviewNumber + """', '""" + resultString + """')""")

        for word in resultStringList:
            wordDic[word] = wordDic.get(word, 0) + 1

        completeIndex += 1

    # NOTE(review): the word counts are passed as the first positional
    # argument -- verify this matches the signature of the
    # AppendWordDicQuery this module actually uses.
    AppendWordDicQuery(wordDic)

    returnString = "Complete building dictionary for " + fileName
    if skippedIndex > 0:
        returnString += ' (skipped ' + str(skippedIndex) + ' of ' + str(
            len(reviewData)) + ' review)'
    returnString += '\n'

    DataBaseManager.DoManyQuery(insertQuery,
                                title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery,
                                title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')

    return returnString