Exemplo n.º 1
0
    def process(self):
        """Extract a feature vector for every column of one table and pickle it.

        ``self.run_type`` selects the configuration: ``"test"`` uses the
        first table returned by the catalog query and the ``ts_`` key
        prefix, ``"train"`` uses the second table and the ``tr_`` prefix.
        The chosen settings are published through the module globals
        ``table_id``, ``typ``, ``FILE_TYPE`` and ``n_data``.

        For each column the resulting list holds, in order:

        * the type code(s): 0 float-like, 1 int, 2 char, 3 boolean, 4 date,
          5 time (one entry per matching ``*_TYPES`` collection),
        * the length reported in the cursor description (index 3),
        * a primary/foreign key flag, a unique flag and a not-null flag,
        * average used length, variance of length and the variance
          coefficient of length as computed by ``fe``,
        * for type codes 0 and 1: the values of ``fe.numFeatures`` followed
          by seven ``0.0`` placeholders; otherwise five ``0.0`` placeholders
          followed by seven text features from ``fe``.

        The dictionary, re-keyed as ``typ + column_name``, is written to
        ``../Feature_Vectors/DataFeatures_<FILE_TYPE>.pickle``.
        """
        global table_id, typ, FILE_TYPE, n_data
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("Running %s -> Matching pickle files to be created..."
              % self.run_type)

        if self.run_type == "test":
            FILE_TYPE = "Match"  # Train or Match
            n_data = 52
            typ = 'ts_'
            table_id = 0
        elif self.run_type == "train":
            n_data = 1644
            typ = 'tr_'
            FILE_TYPE = "Train"
            table_id = 1
        # NOTE(review): any other run_type leaves the four globals as they
        # were before the call -- confirm whether that should be an error.

        conn = db.connect("dbname='postgres' user='******'")
        try:
            curs = conn.cursor()
            # NOTE(review): there is no ORDER BY, so which table sits at
            # index table_id depends on the order the server returns rows.
            curs.execute(
                "SELECT table_name FROM information_schema.tables WHERE table_schema='public' ;"
            )
            tab = curs.fetchall()
            print("Tables: %s" % tab)

            table_nam = ''.join(tab[table_id])
            print(" Table names --%s" % table_nam)

            curs = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

            # Only the cursor description is needed from this query.
            # Identifiers cannot be bound as query parameters, so the table
            # name (read from the catalog above) is concatenated.
            curs.execute('Select * from ' + table_nam + ' limit 20;')
            descr = curs.description  # description about the column attributes
            column_names = [column[0] for column in descr]

            # (collection of type codes, feature code) pairs. Every matching
            # collection contributes its code, as independent checks.
            type_codes = (
                (FLOATISH_TYPES, 0),  # real/float/numeric
                (INT_TYPES, 1),
                (CHAR_TYPES, 2),
                (BOOLEAN_TYPES, 3),
                (DATE_TYPES, 4),
                (TIME_TYPES, 5),
            )

            attributes = {}
            for column in descr:
                features = attributes[column[0]] = []
                for known_types, code in type_codes:
                    if column[1] in known_types:
                        features.append(code)
                # NOTE(review): if no collection matches, the length below
                # lands at index 0, where later code expects the type code.
                features.append(column[3])  # storing length of attribute

            debug("Printing attributes ---")
            for key, val in attributes.items():
                debug("%s=>%s" % (key, val))

            # List the constraints (keys, unique, check) on the columns of
            # the table. The table name is passed as a bound parameter so it
            # is quoted by the driver instead of being spliced into the SQL.
            curs.execute(
                "SELECT tc.constraint_type, tc.table_name, kcu.column_name "
                "FROM information_schema.table_constraints tc "
                "LEFT JOIN information_schema.key_column_usage kcu "
                "ON tc.constraint_catalog = kcu.constraint_catalog "
                "AND tc.constraint_schema = kcu.constraint_schema "
                "AND tc.constraint_name = kcu.constraint_name "
                "WHERE tc.table_name = %s;", (table_nam,))

            # Keep only the key and unique constraints from that list.
            constraint_column = [
                row for row in curs.fetchall()
                if row[0] in ('PRIMARY KEY', 'FOREIGN KEY', 'UNIQUE')
            ]

            debug("Constraint columns ---")
            debug(constraint_column)

            # Append 0 for Primary and Foreign Key, then flip it to 1 for
            # every column found in the constraint list. The flag is the
            # last element at this point, hence the [-1] index.
            for name in column_names:
                attributes[name].append(0)
            for row in constraint_column:
                if row[0] in keys:
                    attributes[row[2]][-1] = 1

            # Same scheme for the Unique constraint flag.
            for name in column_names:
                attributes[name].append(0)
            for row in constraint_column:
                if row[0] == 'UNIQUE':
                    attributes[row[2]][-1] = 1

            # List the nullable setting of every column of the table.
            curs.execute(
                "select column_name, IS_NULLABLE from INFORMATION_SCHEMA.COLUMNS "
                "where table_name = %s;", (table_nam,))
            Null_attributes = curs.fetchall()

            # Append 0 for the Not_Null flag, then set 1 where the column
            # cannot be null.
            for name in column_names:
                attributes[name].append(0)
            for row in Null_attributes:
                if row[1] == 'NO':
                    attributes[row[0]][-1] = 1

            curs = conn.cursor()

            for name in column_names:
                features = attributes[name]
                curs.execute('Select ' + name + ' from ' + table_nam + ';')
                # First field of every row; unlike zip(*rows)[0] this also
                # yields an empty list for an empty table.
                data = [row[0] for row in curs.fetchall()]

                # Converting the date format to mm/dd/yyyy
                if features[0] == 4:
                    data = [value.strftime('%m/%d/%Y') for value in data]

                fixed_length = features[1]
                # Computed once and reused for both the feature and the log.
                avg_used = fe.averageusedlength(data, n_data, fixed_length)
                features.append(float(avg_used))
                debug('average used length - %.2f' % avg_used)

                features.append(fe.varianceoflength(data, fixed_length))
                features.append(fe.varianceCoefflength(data))

                if features[0] in (0, 1):
                    # Numeric column: numeric features, then zeros in the
                    # seven text-feature slots.
                    features.extend(fe.numFeatures(data, fixed_length, n_data))
                    features.extend([0.0] * 7)
                else:
                    # Non-numeric column: zeros in five slots (presumably
                    # the numeric-feature slots -- TODO confirm the count
                    # against fe.numFeatures), then the text features.
                    features.extend([0.0] * 5)
                    features.append(fe.WhiteSpaceFeature(data))
                    features.append(fe.specialChars(data))
                    features.append(fe.NumbertoAll(data))
                    features.append(fe.ChartoAll(data))
                    features.append(fe.Numberofbackslash(data))
                    features.append(fe.Numberofbrackets(data))
                    features.append(fe.Numberofhyphen(data))
        finally:
            # The connection is local to this call; release it on every path.
            conn.close()

        debug("Attributes of state-----")
        # .get() so a table without a 'state' column does not abort the run
        # from inside a debug dump.
        debug(attributes.get('state'))
        debug("END attrib state ------")

        # Prefix every column name with the run-type tag (ts_ / tr_).
        new_attributes = {typ + name: values
                          for name, values in attributes.items()}

        file_name = "../Feature_Vectors/DataFeatures_" + FILE_TYPE + ".pickle"
        log("Saving features file:%s" % file_name)
        # The with-block closes (and flushes) the file even if dump() fails.
        with open(file_name, 'wb') as handle:
            pickle.dump(new_attributes, handle)

        print("Total attributes:%d" % len(new_attributes))
Exemplo n.º 2
0
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
    else:
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(0.0)
        attributes[descr[i][0]].append(fe.WhiteSpaceFeature(data))
        attributes[descr[i][0]].append(fe.specialChars(data))
        attributes[descr[i][0]].append(fe.NumbertoAll(data))
        attributes[descr[i][0]].append(fe.ChartoAll(data))
        attributes[descr[i][0]].append(fe.Numberofbackslash(data))
        attributes[descr[i][0]].append(fe.Numberofbrackets(data))
        attributes[descr[i][0]].append(fe.Numberofhyphen(data))

# for key,values in attributes.items():
# print key,values

# Debugging aid: print the 'state' entry of attributes between two marker
# lines. This raises KeyError when attributes has no 'state' key.
print "Attributes of state-----"
print(attributes['state'])
print "END attrib state ------"

# Build new_attributes with the same values as attributes, each stored under
# its original key prefixed with typ. The values are the same objects, not
# copies.
new_attributes = {}
for k in attributes.keys():
    key_val = typ + k
    new_attributes[key_val] = attributes[k]