# Shared imports for the examples below; helper functions such as
# create_final_data and remove_misc come from the surrounding project.
import os

import pandas as pd


def create_laptop_test_data():
    '''
    Creates positive and negative test laptop data and saves it to final_laptop_test_data.csv
    '''
    
    file_path = 'data/test/final_laptop_test_data.csv'

    # Load the test laptop data
    laptop_df = pd.read_csv('data/base/retailer_test.csv')
    laptop_df['index'] = laptop_df.index
    laptop_df['index'] = laptop_df['index'].astype('object')
    
    if not os.path.exists(file_path):
        print('Generating test laptop data . . . ')

        # Create the negative and positive dataframes 
        neg_df = create_neg_laptop_test_data(laptop_df)
        pos_df = create_pos_laptop_test_data(laptop_df)
        
        # Concatenate the data and save it
        final_laptop_test_df = create_final_data(pos_df, neg_df)
        final_laptop_test_df = final_laptop_test_df.sample(frac=1)
        final_laptop_test_df.to_csv(file_path)

    else:
        print('Already have test laptop data. Moving on . . . ')
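
# A quick sanity check of the generated test file (a sketch; the column names
# below follow the title_one/title_two/label convention used elsewhere in the
# project and are an assumption for this file):
test_df = pd.read_csv('data/test/final_laptop_test_data.csv')
print(test_df[['title_one', 'title_two', 'label']].head())
print('Label balance:\n{}'.format(test_df['label'].value_counts()))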
def create_retailer_laptop_train_data():
    '''
    Creates positive and negative laptop data from the Amazon, Walmart and Newegg
    titles and saves it to retailer_laptop_data.csv
    '''

    file_path = 'data/train/retailer_laptop_data.csv'
    
    if not os.path.exists(file_path):
        print('Generating Retailer Laptop train data . . .')
        # Get the laptop data from the different sources
        amazon_laptops = pd.read_csv('data/base/amazon_laptop_titles.csv')
        walmart_laptops = pd.read_csv('data/base/walmart_laptop_titles.csv')
        newegg_laptops = pd.read_csv('data/base/newegg_laptop_titles.csv')

        # Concatenate the data
        laptops = remove_misc(pd.concat([amazon_laptops, walmart_laptops, newegg_laptops]))
        laptops['title'] = laptops['title'].apply(lambda x: remove_stop_words(x, omit_punctuation=['.']))
        laptops = laptops.drop_duplicates(subset=['title'])

        # Create positive titles
        pos_titles = create_pos_laptop_data(laptops)
        pos_titles = pos_titles.drop_duplicates(subset=['title_one', 'title_two'])
        
        # Create negative titles
        neg_titles = create_neg_laptop_data(laptops)
        neg_titles = neg_titles.drop_duplicates(subset=['title_one', 'title_two'])

        # Combine the positive and negative DataFrames and put them in a CSV
        retailer_laptop_df = create_final_data(pos_titles, neg_titles)
        retailer_laptop_df.to_csv(file_path)
    
    else:
        print('Already have Retailer Laptop train data. Moving on . . .')
def create_spec_laptop_data():
    '''
    Creates positive and negative laptop data from the spec combinations and saves
    it to spec_train_data.csv
    '''

    file_path = 'data/train/spec_train_data.csv'
    if not os.path.exists(file_path):
        print('Generating general spec data for laptops . . . ')
        populate_spec()
        if not os.path.exists('data/train/spec_data.csv'):
            print(
                'Generating spec data combinations. WARNING: THIS WILL CONSUME RESOURCES AND TAKE A LONG TIME.'
            )
            gen_spec_combos()
        spec_df = pd.read_csv('data/train/spec_data.csv')
        pos_df = create_pos_spec_data(spec_df,
                                      rm_attrs=[['company'], ['product'],
                                                ['screen'],
                                                ['product', 'screen'],
                                                ['company', 'screen']],
                                      add_attrs=[])
        neg_df = create_neg_spec_laptop(
            spec_df,
            ['cpu', 'ram', 'hard_drive', 'product', 'inches', 'screen'])
        final_spec_df = create_final_data(pos_df, neg_df)
        final_spec_df.to_csv(file_path)

    else:
        print('Already have spec data. Moving on . . .')
def create_computer_gs_data():
    '''
    Creates positive and negative computer data from the WDC Product Corpus
    and saves it to wdc_computers.csv
    '''

    file_path = 'data/train/wdc_computers.csv'
    if not os.path.exists(file_path):
        print('Generating Gold Standard Computer data . . .')
        # Get the titles from the WDC Product Corpus
        if not os.path.exists('data/base/computer_wdc_whole_no_duplicates.csv'):
            computer_df = generate_computer_data()
            computer_df = computer_df.drop_duplicates('title')
            computer_df.to_csv('data/base/computer_wdc_whole_no_duplicates.csv')
        
        else:
            computer_df = pd.read_csv('data/base/computer_wdc_whole_no_duplicates.csv')
        
        # Get "good" clusters from the data
        valid_clusters = list(get_valid_clusters(computer_df))
        computer_train_wdc_pos = pd.DataFrame(columns=["title_one", "title_two", "label"])
        computer_train_wdc_neg = pd.DataFrame(columns=["title_one", "title_two", "label"])

        # Positive data creation (pd.concat instead of the removed DataFrame.append)
        for cluster in valid_clusters:
            computer_train_wdc_pos = pd.concat(
                [computer_train_wdc_pos, create_pos_from_cluster(computer_df, cluster)])

        # Negative data creation
        for cluster in valid_clusters:
            computer_train_wdc_neg = pd.concat(
                [computer_train_wdc_neg, create_neg_from_cluster(computer_df, cluster, valid_clusters)])

        # Concatenate the data
        computer_train_wdc = create_final_data(computer_train_wdc_pos, computer_train_wdc_neg)
        computer_train_wdc.to_csv(file_path)
    
    else:
        print('Already have Gold Standard Computer Data. Moving on . . .')
def create_data():
    '''
    Runs the necessary functions to create the data for training.
    '''

    # Don't show the copy warnings
    pd.set_option('mode.chained_assignment', None)

    # Run the functions
    populate_spec()
    create_pcpartpicker_data()
    create_general_cpu_data()
    create_final_drive_data()
    create_pseudo_laptop_data()

    print('Generating gigabyte data (examples that only use GB units) . . .')
    final_gb_data = create_final_data(gen_gb_pos_data(), gen_neg_gb_data())
    final_gb_data.reset_index(inplace=True)
    randomize_units(final_gb_data, units=['gb'])

    create_laptop_test_data()
    create_neg_laptop_test_data()
    create_retailer_laptop_train_data()
    create_computer_gs_data()

    # Load all the data
    final_computer_df = pd.read_csv('data/train/wdc_computers.csv')
    final_pseudo_laptop_df = pd.read_csv('data/train/spec_train_data_new.csv')
    final_pcpartpicker_data = pd.read_csv(
        'data/train/final_pcpartpicker_data.csv').sample(frac=1)
    more_cpu_data = pd.read_csv('data/train/more_cpu_data.csv')
    more_drive_data = pd.read_csv('data/train/more_drive_data.csv')
    retailer_laptop_df = pd.read_csv('data/train/retailer_laptop_data.csv')
    all_data = [
        final_computer_df, final_pseudo_laptop_df, more_cpu_data,
        final_gb_data, more_drive_data, retailer_laptop_df
    ]

    # Print the sizes of the data
    print('Computer df size: {}'.format(len(final_computer_df)))
    print('Pseudo-Laptop df size: {}'.format(len(final_pseudo_laptop_df)))
    print('PCPartPicker df size: {}'.format(len(final_pcpartpicker_data)))
    print('More Drive Data df size: {}'.format(len(more_drive_data)))
    print('More CPU Data df size: {}'.format(len(more_cpu_data)))
    print('Final GB Data: {}'.format(len(final_gb_data)))
    print('Retailer Laptop Data: {}'.format(len(retailer_laptop_df)))

    # Concatenate everything
    total_data = pd.concat(all_data)
    total_data = total_data.sample(frac=1)
    total_data = remove_misc(total_data)

    # Get the max length of the data for padding in BERT
    Common.MAX_LEN = get_max_len(total_data)

    print('Total data size: {}'.format(len(total_data)))

    # Save the data
    total_data.to_csv('data/train/total_data.csv', index=False)
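
# A minimal entry-point sketch (an assumption, not part of the original module)
# showing how create_data might be invoked; Common.MAX_LEN is set inside
# create_data and is presumably read later by the training code.
if __name__ == '__main__':
    create_data()
    print('Max title length for BERT padding: {}'.format(Common.MAX_LEN))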
def create_pcpartpicker_data():
    '''
    Creates data for CPU, RAM, and drive data.
    Saves the data to final_pcpartpicker_data.csv
    '''

    file_path = 'data/train/final_pcpartpicker_data.csv'
    if not os.path.exists(file_path):
        print('Generating PCPartPicker data . . .')
        ram_df = remove_misc(pd.read_csv('data/base/pos_ram_titles.csv'))
        cpu_df = remove_misc(pd.read_csv('data/base/pos_cpu_titles.csv'))
        hard_drive_df = remove_misc(
            pd.read_csv('data/base/pos_hard_drive_titles.csv'))

        # Generate all the positive data for the categories
        pos_ram_data = generate_pos_pcpartpicker_data(ram_df)
        pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)
        pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)

        # Generate all the negative data for the categories
        neg_ram_data = generate_neg_pcpartpicker_data(ram_df)
        neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)
        neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

        # Generate the final data
        final_ram_data = create_final_data(pos_ram_data, neg_ram_data)
        final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)
        final_hard_drive_data = create_final_data(pos_hard_drive_data,
                                                  neg_hard_drive_data)

        print('Number of examples for the CPU, RAM and drive data:',
              len(final_cpu_data), len(final_ram_data),
              len(final_hard_drive_data))

        # Concatenate the data and save it
        final_pcpartpicker_df = pd.concat(
            [final_ram_data, final_cpu_data, final_hard_drive_data])
        final_pcpartpicker_df.reset_index(inplace=True)
        randomize_units(final_pcpartpicker_df, units=['gb'])
        final_pcpartpicker_df.to_csv(file_path)

    else:
        print('Already have PCPartPicker data. Moving on . . .')
def create_pcpartpicker_data():
    '''
    Creates positive and negative CPU, RAM and drive data and returns the three
    final DataFrames instead of saving them to a CSV.
    '''

    ram_df = remove_misc(pd.read_csv('data/train/pos_ram_titles.csv'))
    cpu_df = remove_misc(pd.read_csv('data/train/pos_cpu_titles.csv'))
    hard_drive_df = remove_misc(pd.read_csv('data/train/pos_hard_drive_titles.csv'))

    # Generate all the positive data for the categories
    pos_ram_data = generate_pos_pcpartpicker_data(ram_df)
    pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)
    pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)

    # Generate all the negative data for the categories
    neg_ram_data = generate_neg_pcpartpicker_data(ram_df)
    neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)
    neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

    # Generate the final data
    final_ram_data = create_final_data(pos_ram_data, neg_ram_data)
    final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)
    final_hard_drive_data = create_final_data(pos_hard_drive_data, neg_hard_drive_data)

    print('Number of examples for the CPU, RAM and hard drive data:', len(final_cpu_data), len(final_ram_data), len(final_hard_drive_data))
    return final_cpu_data, final_ram_data, final_hard_drive_data
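
# A brief usage sketch for the variant above that returns the three DataFrames
# instead of writing a CSV (the output path below is an assumption, chosen to
# mirror the saving variant of this function):
cpu_data, ram_data, drive_data = create_pcpartpicker_data()
pd.concat([cpu_data, ram_data, drive_data]).to_csv(
    'data/train/final_pcpartpicker_data.csv', index=False)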
def create_final_drive_data():
    '''
    Creates positive and negative drive data and saves it to more_drive_data.csv
    '''

    file_path = 'data/train/more_drive_data.csv'
    if not os.path.exists(file_path):
        print('Generating general drive data . . . ')
        # Generate the data
        pos_df = generate_pos_hard_drive_data()
        neg_df = generate_neg_hard_drive_data()

        # Concatenate the data and save it
        final_df = create_final_data(pos_df, neg_df)
        final_df.to_csv(file_path)

    else:
        print('Already have general drive data. Moving on . . .')
def create_general_cpu_data():
    '''
    Creates positive and negative CPU data and saves it to more_cpu_data.csv
    '''

    file_path = 'data/train/more_cpu_data.csv'
    if not os.path.exists(file_path):
        print('Generating general cpu data . . . ')
        # Create the positive and negative examples
        pos_df = generate_pos_cpu_data()
        neg_df = generate_neg_cpu_data()

        # Concatenate the data and save it
        final_cpu_df = create_final_data(pos_df, neg_df)
        final_cpu_df.to_csv(file_path)

    else:
        print('Already have general CPU data. Moving on . . .')
def create_laptop_data():
    '''
    Creates positive and negative laptop data and returns the shuffled DataFrame.
    '''

    # Load the laptop data
    laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')
    create_attribute_sets(laptop_df)

    neg_df = create_neg_laptop_data(
        laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])
    pos_df = create_pos_laptop_data(laptop_df,
                                    rm_attrs=[['Company'], ['TypeName'],
                                              ['ScreenResolution'],
                                              ['Product'],
                                              ['TypeName',
                                               'ScreenResolution']],
                                    add_attrs=[])
    final_laptop_df = create_final_data(pos_df, neg_df)
    final_laptop_df = final_laptop_df.sample(frac=1)
    return final_laptop_df
def create_final_drive_data():
    '''
    Creates positive and negative drive data and saves it to more_drive_data.csv
    '''

    file_path = 'data/train/more_drive_data.csv'
    if not os.path.exists(file_path):
        print('Generating general drive data . . . ')
        # Generate the data
        pos_df = generate_pos_hard_drive_data()
        neg_df = generate_neg_hard_drive_data()

        # Concatenate the data and save it
        final_df = create_final_data(pos_df, neg_df)
        final_df.reset_index(inplace=True)
        randomize_units(final_df, ['gb'])
        final_df.to_csv(file_path)

    else:
        print('Already have general drive data. Moving on . . .')
def create_laptop_data():
    '''
    Creates positive and negative laptop data and saves it to final_laptop_data.csv
    '''

    file_path = 'data/train/final_laptop_data.csv'
    # Load the laptop data
    laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')
    
    # Create the attribute sets for the LaptopAttributes
    create_attribute_sets(laptop_df)
    
    if not os.path.exists(file_path):
        print('Generating laptop data . . . ')
        # Create the negative and positive dataframes 
        neg_df = create_neg_laptop_data(laptop_df,
                                        attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])
        pos_df = create_pos_laptop_data(laptop_df,
                                        rm_attrs=[['Company'], ['TypeName'],
                                                  ['ScreenResolution'],
                                                  ['Product'],
                                                  ['TypeName', 'ScreenResolution']],
                                        add_attrs=[])
        
        # Concatenate the data and save it
        final_laptop_df = create_final_data(pos_df, neg_df)
        final_laptop_df = final_laptop_df.sample(frac=1)
        final_laptop_df.to_csv(file_path)

    else:
        print('Already have laptop data. Moving on . . . ')
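
# All of the examples above combine their positive and negative examples with a
# create_final_data helper that is not shown here. A minimal sketch of what such
# a helper could look like, assuming both DataFrames already carry
# title_one/title_two/label columns (an assumption, not the project's actual
# implementation):
def create_final_data_sketch(pos_df, neg_df):
    # Concatenate the positive and negative examples and shuffle them
    combined = pd.concat([pos_df, neg_df])
    return combined.sample(frac=1).reset_index(drop=True)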