Пример #1
0
def test_from_pickle():
    """Round-trip a small pickled DataFrame and check Modin matches pandas.

    Writes a fixture pickle of SMALL_ROW_SIZE rows, reads it back with both
    plain pandas and Modin, and asserts the two frames are equal.
    """
    setup_pickle_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_pickle(TEST_PICKLE_FILENAME)
        modin_df = pd.read_pickle(TEST_PICKLE_FILENAME)

        assert modin_df_equals_pandas(modin_df, pandas_df)
    finally:
        # Always remove the fixture, even when the assertion fails, so a
        # failed run does not leak the pickle file into later tests.
        teardown_pickle_file()
Пример #2
0
    """ With multiprocessing using Dask"""
    print("Recommending")
    topn = 500
    sleep(0.2)
    vec_bow = id2word_dictionary.doc2bow(context_list)
    # This line takes a LONG time: it has to map to each of the 300 topics
    vec_ldamallet = ldamallet[vec_bow]
    # Convert the query to LDA space
    sims = malletindex[vec_ldamallet]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[:topn]
    # sims is a list of tuples of (docid -- line num in original training file, probability)
    return [docid_to_magid.get(docid) for docid, prob in sims] 

#df = pd.read_pickle('recommendationsids.pickle')
# NOTE(review): the triple-quoted block below is disabled code kept as a
# bare string literal — it is evaluated as an expression and discarded,
# never executed. Left byte-identical here.
'''df = pd.read_pickle('/home/ashwath/Programs/ACLAAn/Pickles/recommendations_aclmag_3models_500_df.pickle')
malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open('/home/ashwath/Programs/ACLAAn/LDA/docid_to_magid_training_acl.pickle', 'rb') as pick:
    docid_to_magid = pickle.load(pick)
id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus('/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')

ldamallet = LdaMallet.load('/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')

df['lda_recommendations'] = df['context_for_lda'].progress_apply(lda_recommend)
df['lda_binary'] = df[['ground_truth', 'lda_recommendations']].apply(
        lambda x: binarize_predictions(x.ground_truth, x.lda_recommendations), axis=1)
df.to_pickle('/home/ashwath/Programs/ACLAAn/Pickles/malletrecommendations_aclmag_500_df.pickle')
df.to_csv('/home/ashwath/Programs/ACLAAn/Evaluation/malletrecommendations_aclmag_500.tsv', sep='\t')'''
sleep(0.3)
# Live path: load the previously computed Mallet recommendations and score
# them. NOTE(review): purpose of the sleep above is not evident — confirm.
df = pd.read_pickle('/home/ashwath/Programs/ACLAAn/Pickles/malletrecommendations_aclmag_500_df.pickle')
calculate_metrics(df[['lda_recommendations', 'lda_binary', 'ground_truth']])
Пример #3
0
if get_data:
    # Scrape fresh listings via the Domain API wrapper. Argument order:
    # listing_results(propertyTypes, minBedrooms, minBathrooms, minCarspaces,
    #                 minPrice, maxPrice, minLandArea, state, region, area,
    #                 suburb, includeSurroundingSuburbs)
    # TODO: zero values for the numeric filters are not supported yet.
    df = domain().listing_results(
        ["House"], 2, 1, 1, 500000, 550000, 10, "VIC", "", "", "", False)

    print("Main frame shape: ", df.shape)

    # Persist the raw scrape so later runs can reuse it via `static_pkl`.
    df.to_csv('data/{version}_data.csv'.format(version=version), sep='\t', encoding='utf-8')
    df.to_pickle('data/{version}_data.pkl'.format(version=version))
    # Duplicate-id sanity check, kept for reference:
    #dup_df_2 = df[df['id'].duplicated() == True]
    #dup_df_2 = dup_df_2.sort_values(by=['id'])
    #r, c = dup_df_2.shape
    #if r > 0:
    #    print("duplicates in df")
else:
    # Offline mode: reuse a previously scraped snapshot.
    df = pd.read_pickle(static_pkl)

if compute_features:
    # Extract the per-listing feature columns. Known columns include:
    # 'BuiltInWardrobes', 'SecureParking', 'AirConditioning', 'Ensuite',
    # 'Gas', 'Heating', 'Dishwasher', 'BalconyDeck', 'InternalLaundry',
    # 'PetsAllowed', 'Bath', 'Study', 'FullyFenced', 'Floorboards',
    # 'BroadbandInternetAccess', 'GardenCourtyard', 'AlarmSystem', 'Shed',
    # 'Gym', 'Intercom', 'SolarPanels', 'WaterViews', 'Furnished',
    # 'NorthFacing', 'SwimmingPool', 'RainwaterStorageTank',
    # 'CableOrSatellite', 'GroundFloor', 'SolarHotWater', 'TennisCourt',
    # 'OutdoorSpa', 'DoubleGlazedWindows', 'WallCeilingInsulation',
    # 'SeparateDiningRoom', 'IndoorSpa'
    features = processing.compute_features(df)
    print(features)

# Score every listing against the configured feature ranking and export.
df = processing.feature_score(df, feature_ranking)
print(df)
df.to_csv('data/{version}_data_featurescored.csv'.format(version=version), sep='\t', encoding='utf-8')
Пример #4
0
def predict(filename):
    """Binarize stored hd2v/LDA/BM25 recommendations against the ground
    truth, persist the augmented dataframe, and compute evaluation metrics.

    NOTE(review): on the live path ``filename`` is unused — it is only
    referenced inside the disabled (string-literal) block below, which held
    the original end-to-end pipeline that built the recommendations.
    """
    '''    df = pd.read_csv(filename, sep='\t', names=['ground_truth', 'citing_acl_id', 'context'])
    print("Read file")
    #df = df.head()
    # Convert cited mag ids to a list
    df['ground_truth'] = df['ground_truth'].astype('str').apply(lambda x: x.split(','))
    df['context_for_lda'] = df['context'].apply(lda_preprocessing)
    print("Created lda contexts")
    sleep(0.3)
    # clean_text is present in hyperdoc2vec
    df['context'] = df['context'].apply(clean_text)
    print('Cleaned contexts')
    sleep(0.3)
    df['wordcount'] = df['context'].apply(lambda x: len(x.split()))
    # Remove contexts with less than 8 words
    df = df[df.wordcount>8]
    df['hd2v_recommendations'] = df['context'].apply(hd2v_recommend)
    print('hd2v recommendations done')
    sleep(0.3)
    df['bm25_recommendations'] = df['context'].apply(solr_recommend)
    sleep(0.3)
    print('solr recommendations done')
    # LDA Recommendations take a long time, parallelize
    #ddask = dd.from_pandas(df, npartitions=64)
    #ddask['lda_recommendations'] = ddask['context_for_lda'].apply(lda_recommend)
    df['lda_recommendations'] = df['context_for_lda'].apply(lda_recommend)
    print('lda recommendations done')  '''
    # Live path: reuse the recommendations computed on a previous run.
    df = pd.read_pickle('recommendationsids.pickle')
    print('read pickle')
    #df.to_pickle('recommendationsids.pickle')

    #sleep(0.3)
    # For each of the three methods, derive a *_binary column marking which
    # recommendations hit the ground truth for that row.
    df['hd2v_binary'] = df[['ground_truth', 'hd2v_recommendations']].apply(
        lambda x: binarize_predictions(x.ground_truth, x.hd2v_recommendations),
        axis=1)
    print(df.hd2v_binary)
    sleep(0.3)
    df['lda_binary'] = df[['ground_truth', 'lda_recommendations']].apply(
        lambda x: binarize_predictions(x.ground_truth, x.lda_recommendations),
        axis=1)
    sleep(0.3)
    df['bm25_binary'] = df[['ground_truth', 'bm25_recommendations']].apply(
        lambda x: binarize_predictions(x.ground_truth, x.bm25_recommendations),
        axis=1)
    # NOTE: for 500 recommendations, ground truth is present in hd2v's recommendations 909 times, lda 1146 times, bm25 1543 times.
    # (total: 2819) -- 32.24%, 40.65%, 54.74%
    #  t = df.bm25_binary.apply(lambda x: 1 in x or 2 in x)
    print("Binarized")
    # Persist the augmented frame (pickle for reuse, TSV for inspection).
    df.to_pickle(
        '/home/ashwath/Programs/ACLAAn/Pickles/recommendations_aclmag_3models_500_df.pickle'
    )
    df.to_csv(
        '/home/ashwath/Programs/ACLAAn/Evaluation/recommendations_aclmag_3models_500.tsv',
        sep='\t')

    print("Prediction done")
    # Evaluate all three methods against the ground truth.
    calculate_metrics(df[[
        'hd2v_recommendations', 'lda_recommendations', 'bm25_recommendations',
        'hd2v_binary', 'lda_binary', 'bm25_binary', 'ground_truth'
    ]])
Пример #5
0
        '/home/ashwath/Programs/MAGCS/Pickles/paperwisemetrics_magLDA_3models_df.pickle'
    )
    # NOTE(review): the call closed by the paren above starts before this
    # fragment — presumably a df.to_pickle(...); confirm in the full file.
    print("METRICS CALCULATED, time to calculate the means")
    # Get the mean of all the index columns
    # First, drop list columns (they cannot be averaged).
    df = df.drop(['lda_recommendations', 'lda_binary', 'ground_truth'], axis=1)
    # Column-wise mean over the remaining (numeric) metric columns.
    mean_series = df.mean()
    # Emit one "<metric>\t<mean>" row per metric (index kept, no header).
    mean_series.to_csv(
        '/home/ashwath/Programs/MAGCS/Evaluation/meanmetrics_mag_lda.tsv',
        sep='\t',
        index=True,
        header=False)
    print("C'est fini.")


# Script entry point: compute LDA recommendations for the pre-built MAG
# dataframe, binarize them against the ground truth, persist the result,
# and report the evaluation metrics.
df = pd.read_pickle(
    '/home/ashwath/Programs/MAGCS/MAG-hyperdoc2vec/recommendationsids.pickle')

# Recommendations per citation context, with a tqdm progress bar.
df['lda_recommendations'] = df['context_for_lda'].progress_apply(lda_recommend)
print('lda recommendations done')


def _binarize_row(row):
    # Delegate to binarize_predictions for a single dataframe row.
    return binarize_predictions(row.ground_truth, row.lda_recommendations)


df['lda_binary'] = df[['ground_truth', 'lda_recommendations']].apply(
    _binarize_row, axis=1)
sleep(0.3)
print("Binarized")

df.to_pickle(
    '/home/ashwath/Programs/MAGCS/Pickles/ldarecommendations_mag_3models_500_df.pickle'
)
calculate_metrics(df[['lda_recommendations', 'lda_binary', 'ground_truth']])