Example #1
def data_cleanup():
    """
    Cleans the dataset by removing 1/10 of the wines, sorted by wine rarity.
    """
    # file locations
    csv_path = "../data/Wine/winemag-data-130k-v2.csv"
    output_path = "../data/Wine/wine_clean.csv"
    wine_count = "../data/Wine/varietals.csv"

    # remove 10% of the rarest wines in the list
    count_data = util.load(wine_count, [])[0]
    counts, varietals = count_data
    counts = [int(count) for count in counts]
    remove_pct = 0.1
    remove_num = remove_pct * sum(counts)
    partial_sum = 0
    idx = 0
    while partial_sum < remove_num:
        partial_sum += counts[idx]
        idx += 1
    wines_to_keep = varietals[idx:]

    # create flags for the data cleaning function
    flags = dict()
    flags["Top Wines"] = wines_to_keep
    # can change these column indices
    flags["Special Chars"] = [2, 3, 6, 7, 8, 11, 12]
    util.clean_data(csv_path, output_path, flags)
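
Note: util.load and util.clean_data are project-specific helpers that the snippet does not include. Below is a minimal sketch of a flags-driven cleaner consistent with the call above; the varietal column index and the way the "Special Chars" columns are handled are assumptions, not the project's actual implementation.

import csv

def clean_data(csv_path, output_path, flags):
    # Hypothetical sketch: keep rows whose varietal is in flags["Top Wines"] and
    # strip non-ASCII characters from the columns listed in flags["Special Chars"].
    top_wines = set(flags.get("Top Wines", []))
    special_cols = flags.get("Special Chars", [])
    with open(csv_path, newline='', encoding='utf-8') as fin, \
         open(output_path, 'w', newline='', encoding='utf-8') as fout:
        reader = csv.reader(fin)
        writer = csv.writer(fout)
        writer.writerow(next(reader))  # copy the header row unchanged
        for row in reader:
            # Column 12 is 'variety' in winemag-data-130k-v2.csv (assumed layout).
            if top_wines and row[12] not in top_wines:
                continue
            for c in special_cols:
                if c < len(row):
                    row[c] = row[c].encode('ascii', 'ignore').decode('ascii')
            writer.writerow(row)
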
Example #2
def read_dataset(clip_len):
    # Read dataset
    test_dataset = get_data('test.csv')
    classes_list = get_classes(test_dataset)
    print('Number of classes:', len(classes_list))
    print('Test set:', len(test_dataset))
    test_dataset = clean_data(test_dataset,
                              clip_len + 1,
                              classes=classes_list,
                              MAX_FRAMES=3000)
    print('Test set after clean:', len(test_dataset))
    return test_dataset, classes_list
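
Note: get_data, get_classes and clean_data come from the project's own utilities and are not shown here. Below is a hedged sketch of the filtering clean_data appears to perform, assuming each dataset entry is a (video_path, n_frames, label) record, which is itself an assumption.

def clean_data(dataset, min_frames, classes=None, MAX_FRAMES=3000):
    # Hypothetical filter: drop videos that are too short to cut a clip,
    # unreasonably long, or labeled with a class outside the known class list.
    kept = []
    for video_path, n_frames, label in dataset:
        if n_frames < min_frames or n_frames > MAX_FRAMES:
            continue
        if classes is not None and label not in classes:
            continue
        kept.append((video_path, n_frames, label))
    return kept

Passing clip_len + 1 as the minimum presumably keeps only videos long enough to cut at least one full clip.
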
Example #3
def index_message():
    filter_words = load_filter_words()
    message_loaded_nodes = pickle.load(open(SETTINGS.message_object_file,
                                            "rb"))
    sPerson = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    docs = []
    for key, value in message_loaded_nodes.iteritems():

        #Checking if the subject or body contains filter words (non-compliant words)
        compliantFlag = True

        # NoneType check: fall back to whichever field is present
        if value._subject is None:
            text = value._body
        elif value._body is None:
            text = value._subject
        else:
            text = value._subject + value._body

        if is_filter_word_present(text, filter_words):
            compliantFlag = False

        doc = {
            "nodeId": key,
            "datetime": value._datetime,
            "epochSecs": value._epoch_secs,
            "subject": value._subject,
            "body": clean_data(value._body),
            "emailId": value._email_id,
            "compliantFlag": compliantFlag
        }
        #         doc = {"nodeId":key, "datetime":value._datetime, "epochSecs":value._epoch_secs, "subject":value._subject, "body":value._body, "emailId":value._email_id,"compliantFlag":compliantFlag}
        docs.append(doc)

    sPerson.add(docs)
    sPerson.commit()
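
Note: load_filter_words and is_filter_word_present are project helpers not shown in the snippet. Below is a minimal sketch of the kind of check the compliance flag relies on; the case-insensitive whole-word matching is an assumption.

import re

def is_filter_word_present(text, filter_words):
    # Hypothetical helper: case-insensitive whole-word match against the filter list.
    if not text:
        return False
    tokens = set(re.findall(r"[a-z']+", text.lower()))
    return any(word.lower() in tokens for word in filter_words)
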
Example #4
import util
import config

start_mark = '"text":'


def preprocess_wiki(wiki_folder):
    allfiles = util.absoluteFilePaths(wiki_folder)

    wikis = []
    for path in allfiles:

        fin = open(path, encoding='utf-8', errors='ignore')
        lines = fin.readlines()
        fin.close()

        for line in lines:
            linestr = line.strip()
            if start_mark not in linestr:
                continue  # skip lines that carry no "text" field
            # Slice out the value of "text": skip the marker plus the space and
            # opening quote that follow it, and drop the trailing quote/brace
            # characters of the JSON record.
            start = linestr.index(start_mark) + len(start_mark) + 2
            txt = linestr[start:-4].strip()
            txt = txt.replace('\\n', '')
            txt = ''.join(txt.splitlines())
            if len(txt) > 1:
                wikis.append(txt)
    return wikis


if __name__ == '__main__':
    wikis = preprocess_wiki(config.wiki_folder)
    util.clean_data(wikis, config.wiki_file, ' ')
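
Note: util.clean_data in these corpus-preprocessing scripts takes a list of texts, an output path, and an optional separator. One plausible reading, sketched purely as an assumption, is a writer that emits one document per line and, when a separator is given, inserts it between characters (a common pre-tokenization step for Chinese corpora). The same call shape appears in the Sohu news and webtext examples below.

def clean_data(texts, out_path, sep=''):
    # Hypothetical writer: one document per line; with sep=' ' the characters
    # of each document are space-separated before writing.
    with open(out_path, 'w', encoding='utf-8') as fout:
        for text in texts:
            line = sep.join(text) if sep else text
            fout.write(line + '\n')
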
Example #5
#!/usr/bin/env python

import csv
import xlrd

from util import make_headers, clean_data

IMPORT_FILES = [
    'src/Alaska_Louisiana.xls', 'src/Massachussetts_Wyoming_Territories.xls'
]

if __name__ == "__main__":
    for i, filename in enumerate(IMPORT_FILES):
        workbook = xlrd.open_workbook(filename)
        datemode = workbook.datemode
        worksheets = workbook.sheet_names()

        if i == 0:
            headers = make_headers(workbook.sheet_by_name(worksheets[0]))
            headers['federal_supply_class'] = 'federal_supply_class'
            headers['federal_supply_category'] = 'federal_supply_category'
            f = open("src/leso.csv", "w")
            writer = csv.DictWriter(f, fieldnames=headers.values())
            writer.writeheader()

        for worksheet in worksheets:
            sheet = workbook.sheet_by_name(worksheet)
            clean_data(sheet, writer, headers, datemode)

    # Close the shared output file once every workbook has been processed.
    f.close()
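
Note: the workbook's datemode is threaded through to clean_data presumably because Excel stores dates as float serial numbers that need the workbook's 1900/1904 epoch flag to decode. Below is a small sketch of that conversion using xlrd's own helper; the function name here is made up.

import datetime
import xlrd

def excel_serial_to_iso(value, datemode):
    # xlrd.xldate_as_tuple turns an Excel date serial into calendar components,
    # honouring the workbook's 1900/1904 date system.
    year, month, day, hour, minute, second = xlrd.xldate_as_tuple(value, datemode)
    return datetime.datetime(year, month, day, hour, minute, second).isoformat()
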
Example #6
if (disciplina > 0):
    df_s = df_s.loc[(df_s['CodigoDisciplina'] == disciplina)]
    df_t = df_t.loc[(df_t['CodigoDisciplina'] == disciplina)]

if (len(periodo_letivo_source) > 0):
    df_s = df_s.loc[(df_s['PeriodoLetivo'].isin(periodo_letivo_source))]

if (len(periodo_letivo_test) > 0):
    df_t = df_t.loc[(df_t['PeriodoLetivo'].isin(periodo_letivo_test))]

df_s = df_s.reset_index(drop=True)
df_t = df_t.reset_index(drop=True)

df_s_filter = util.clean_data(df_s,
                              standardize=True,
                              plot_cov=False,
                              title='Matriz de Covariância - ' +
                              disciplinas[disciplina] + ' / ' + s_periodo)
df_t_filter = util.clean_data(df_t,
                              standardize=True,
                              plot_cov=False,
                              title='Matriz de Covariância - ' +
                              disciplinas[disciplina] + ' / ' + t_periodo)

print('Registros source: ' + str(len(df_s_filter)))
print('Registros target: ' + str(len(df_t_filter)))

#df_s_std = util.correlation_alignment(df_s_filter, df_t_filter,1)
df_s_std = df_s_filter

# Shuffle the normalized dataframe
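
Note: standardize=True suggests util.clean_data z-scores the numeric columns; the feature clean-up it also performs is not reproduced here. A minimal standalone sketch of the standardization step, assuming a pandas DataFrame input:

import pandas as pd

def zscore(df: pd.DataFrame) -> pd.DataFrame:
    # Standardize every numeric column to zero mean and unit variance;
    # non-numeric columns are returned unchanged.
    out = df.copy()
    numeric = out.select_dtypes(include='number').columns
    out[numeric] = (out[numeric] - out[numeric].mean()) / out[numeric].std()
    return out
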
Example #7
plot_var_cov = True

modulo = '3'
s_disciplina = 'logica'
#s_disciplina = 'mat_adm'

df_s = pd.read_csv('../Week 3/m' + modulo + '_' + s_disciplina +
                   '_ext_2012_01.csv',
                   sep=',')
df_t = pd.read_csv('../Week 3/m' + modulo + '_' + s_disciplina +
                   '_ext_2012_02_2014_01.csv',
                   sep=',')

# Clean and organize some features and normalize with z-score
df_s_std = util.clean_data(df_s,
                           normalizar,
                           plot_cov=False,
                           title="Clean Data - Covariancia (Ds)")
df_t_std = util.clean_data(df_t,
                           normalizar,
                           plot_cov=False,
                           title="Clean Data - Covariancia (Dt)")

#df_s_std = util.correlation_alignment(df_s_std, df_t_std,1)

# Shuffle the normalized dataframe
df_normalized = shuffle(df_s_std)
df_t_normalized = shuffle(df_t_std)

cm_final = algoritmos.predict_decision_tree(
    df_normalized, df_t_normalized)  #, group_fold_column='CodigoTurma')
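
Note: shuffle here is presumably sklearn.utils.shuffle (its import is not part of the snippet). A self-contained usage sketch:

import pandas as pd
from sklearn.utils import shuffle

df_demo = pd.DataFrame({'a': range(5), 'b': range(5)})
# shuffle returns a new, row-permuted frame; random_state makes the run reproducible.
print(shuffle(df_demo, random_state=42))
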
Example #8
"""
@author: Zhenlin
"""
import util
import config

def preprocess_sohu_news(sohu_path):
    fin = open(sohu_path, encoding='utf-8', errors='ignore')
    lines = fin.readlines()
    fin.close()

    mark1 = '<content>'
    mark2 = '</content>'

    news = []
    # Each record in the dump is assumed to span 6 lines, with the
    # <content> line at offset 4 of every block.
    for i in range(4, len(lines), 6):
        # Strip the surrounding <content>...</content> tags.
        content = lines[i].strip()[len(mark1):-len(mark2)]
        content = content.strip()
        if len(content) > 1:
            news.append(content)
    return news

if __name__ == '__main__':
    sohu_path = r'/root/bytecamp2019/datasets/news_sohusite_xml-utf8.dat'
    sohu_clean = r'/root/bytecamp2019/datasets/news_sohusite_clean-utf8.dat'
    sohu_clean_space = r'/root/bytecamp2019/datasets/news_sohusite_clean_space-utf8.dat'
    news = preprocess_sohu_news(sohu_path)
    util.clean_data(news, sohu_clean)
    util.clean_data(news, sohu_clean_space, ' ')
    
Example #9
"""
import util
import config
import json

start_mark = '"text":'


def preprocess_webtext(text_path):

    fin = open(text_path, encoding='utf-8', errors='ignore')
    lines = fin.readlines()
    fin.close()

    answers = []
    for line in lines:
        linestr = line.strip()
        contents = json.loads(linestr)

        if "content" in contents:
            ans = contents["content"].replace('\\n', '').replace('\\r', '')
            ans = ''.join(ans.splitlines())
            if len(ans) > 1:
                answers.append(ans)
    return answers


if __name__ == '__main__':
    answers = preprocess_webtext(config.webtext_path)
    util.clean_data(answers, config.webtext_clean_path, ' ')
Example #10
def main():
    args = parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(
        args.gpu)  # Choose GPU for training

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.compat.v1.InteractiveSession(config=config)

    input_shape = (args.clip_len, args.crop_size, args.crop_size, 3)
    model_name = args.model
    reg_factor = args.reg_factor
    batch_size = args.batch_size
    epochs = args.epochs
    lr_init = args.lr
    start_epoch = args.start_epoch
    save_path = args.save_path
    temperature = args.temperature
    alpha = args.lambd
    drop_rate = args.drop_rate
    every = 1

    # Create folders for callback
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if not os.path.exists(os.path.join(save_path, "output")):
        os.mkdir(os.path.join(save_path, "output"))
    if not os.path.exists(os.path.join(save_path, "checkpoints")):
        os.mkdir(os.path.join(save_path, "checkpoints"))

    # Write all config to file
    f = open(os.path.join(save_path, 'config.txt'), "w")
    f.write('input shape: ' + str(input_shape) + '\n')
    f.write('model name: ' + model_name + '\n')
    f.write('reg factor: ' + str(reg_factor) + '\n')
    f.write('batch size: ' + str(batch_size) + '\n')
    f.write('numbers of epochs: ' + str(epochs) + '\n')
    f.write('lr init: ' + str(lr_init) + '\n')
    f.write('Temperature: ' + str(temperature) + '\n')
    f.write('Alpha: ' + str(alpha) + '\n')
    f.write('start epoch: ' + str(start_epoch) + '\n')
    f.write('Drop rate: ' + str(drop_rate) + '\n')
    f.close()

    # Read dataset
    train_dataset = get_data('train.csv')
    val_dataset = get_data('val.csv')
    classes_list = get_classes(train_dataset)
    print('Number of classes:', len(classes_list))
    print('Train set:', len(train_dataset))
    print('Val set:', len(val_dataset))

    weight_model_path = os.path.join(save_path, "best_" + model_name + "_.h5")

    train_dataset = clean_data(train_dataset,
                               args.clip_len + 1,
                               classes=classes_list,
                               MAX_FRAMES=3000)
    val_dataset = clean_data(val_dataset,
                             args.clip_len + 1,
                             classes=classes_list,
                             MAX_FRAMES=3000)
    print('Train set after clean:', len(train_dataset))
    print('Val set after clean:', len(val_dataset))

    # --------------------------------------Continuous training with Self Knowledge Distillation----------------------------------------
    train_self_KD(train_dataset,
                  val_dataset,
                  model_name,
                  input_shape,
                  classes_list,
                  lr_init,
                  weight_model_path,
                  start_epoch=start_epoch,
                  reg_factor=reg_factor,
                  save_path=save_path,
                  alpha=alpha,
                  temperature=temperature,
                  batch_size=batch_size,
                  every=every,
                  epochs=epochs,
                  drop_rate=drop_rate)