Exemplo n.º 1
0
def calculate_TF_IDF():
    """Compute keyword term and document frequencies and store them.

    For every word in ``clean_keywords`` the matching ``keywords`` rows are
    fetched.  The term frequency (tf) of a (word, doc_id) pair is the number
    of rows fetched for that pair; the document frequency (df) of a word is
    the number of distinct doc_ids fetched for it.  ``clean_keywords.df``
    and ``keywords.tf`` are then updated with those counts.

    Despite the function name, no inverse document frequency is computed
    here; only the raw tf and df counts are written.
    """
    conn = util.getDBConnection()
    sql = "select word from clean_keywords"
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(sql)
    rows = util.executeSQL(conn, sql)
    word_tf = {}  # (word, doc_id as text) -> number of keywords rows
    word_df = {}  # word -> number of distinct doc_ids
    for row in rows:
        word = row[0]
        # NOTE(review): the word is spliced into the SQL text unescaped, so
        # a word containing a single quote yields a malformed statement.
        # Switch to bound parameters if util.executeSQL supports them --
        # TODO confirm against util.
        sql1 = "select doc_id from keywords where name='" + word + "'"
        print(sql1)
        res = util.executeSQL(conn, sql1)
        for row1 in res:
            # Key on a tuple rather than "word:doc_id": words that contain
            # ':' can no longer break the unpacking in the update loop.
            key = (word, str(row1[0]))
            if key not in word_tf:
                # First time this document is seen for this word.
                word_df[word] = word_df.get(word, 0) + 1
            word_tf[key] = word_tf.get(key, 0) + 1

    for word, df in word_df.items():
        sql = 'update clean_keywords set df=' + str(
            df) + " where word='" + word + "'"
        print(sql)
        util.executeSQL(conn, sql)

    for (word, pkg_id), tf in word_tf.items():
        sql = 'update keywords set tf=' + str(
            tf) + " where name='" + word + "' and doc_id=" + pkg_id
        print(sql)
        util.executeSQL(conn, sql)
Exemplo n.º 2
0
def calculate_TF_IDF():
    """Compute keyword term and document frequencies and store them.

    Reads every word from ``clean_keywords`` and, for each one, the
    ``doc_id`` of every ``keywords`` row with that name.  The number of rows
    fetched per (word, doc_id) pair is written to ``keywords.tf``; the number
    of distinct doc_ids per word is written to ``clean_keywords.df``.

    Despite the function name, no inverse document frequency is computed
    here; only the raw tf and df counts are written.
    """
    conn = util.getDBConnection()
    sql = "select word from clean_keywords"
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(sql)
    rows = util.executeSQL(conn, sql)

    # word -> {doc_id as text -> number of keywords rows}.  Nesting the
    # counts avoids packing "word:doc_id" into one string, which failed to
    # unpack again for words that themselves contain ':'.
    doc_counts_by_word = {}
    for row in rows:
        word = row[0]
        # NOTE(review): the word is spliced into the SQL text unescaped, so
        # a word containing a single quote yields a malformed statement.
        # Switch to bound parameters if util.executeSQL supports them --
        # TODO confirm against util.
        sql1 = "select doc_id from keywords where name='" + word + "'"
        print(sql1)
        for row1 in util.executeSQL(conn, sql1):
            # Create the per-word entry only once a row exists, so words
            # without any keywords rows get no df update at all.
            doc_counts = doc_counts_by_word.setdefault(word, {})
            doc_id = str(row1[0])
            doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1

    # All df updates are issued before any tf update.
    for word, doc_counts in doc_counts_by_word.items():
        # df is the number of distinct documents recorded for the word.
        sql = ('update clean_keywords set df=' + str(len(doc_counts)) +
               " where word='" + word + "'")
        print(sql)
        util.executeSQL(conn, sql)

    for word, doc_counts in doc_counts_by_word.items():
        for doc_id, tf in doc_counts.items():
            sql = ('update keywords set tf=' + str(tf) +
                   " where name='" + word + "' and doc_id=" + doc_id)
            print(sql)
            util.executeSQL(conn, sql)
Exemplo n.º 3
0
        print sql
        util.executeSQL(conn, sql)

    for word_pkgid, tf in word_tf.iteritems():
        word, pkg_id = word_pkgid.split(":")
        sql = 'update keywords set tf=' + str(
            tf) + " where name='" + word + "' and doc_id=" + str(pkg_id)
        print sql
        util.executeSQL(conn, sql)


if __name__ == '__main__':
    try:
        kwd_index = 1
        pkg_id = 1
        conn = util.getDBConnection()
        delete_table_data()  # delete the existing data
        # @todo(Argparse)
        for _dir in os.listdir(constants.PATH):
            insert_package(_dir, pkg_id)
            # @todo(Logging)
            print _dir
            _files = get_package_files(os.path.join(constants.PATH, _dir))
            for root, _file in _files:
                process_file(root, _file, pkg_id)

            if has_enough_keywords(pkg_id):
                pkg_id += 1
        populate_clean_keywords()
        calculate_TF_IDF()
    except Exception as e:
Exemplo n.º 4
0
        sql = 'update clean_keywords set df=' + str(df) + " where word='" + word + "'"
        print sql
        util.executeSQL(conn, sql)

    for word_pkgid, tf in word_tf.iteritems():
        word, pkg_id = word_pkgid.split(":")
        sql = 'update keywords set tf=' + str(tf) + " where name='" + word + "' and doc_id=" + str(pkg_id)
        print sql
        util.executeSQL(conn, sql)


if __name__ == '__main__':
    try:
        kwd_index = 1
        pkg_id = 1
        conn = util.getDBConnection()
        delete_table_data()  # delete the existing data
        # @todo(Argparse)
        for _dir in os.listdir(constants.PATH):
            insert_package(_dir, pkg_id)
            # @todo(Logging)
            print _dir
            _files = get_package_files(os.path.join(constants.PATH, _dir))
            for root, _file in _files:
                process_file(root, _file, pkg_id)
 
            if has_enough_keywords(pkg_id):
                pkg_id += 1
        populate_clean_keywords()
        calculate_TF_IDF()
    except Exception as e: