Python lexical_diversity示例

编程语言: Python

命名空间/包名称: nlp

方法/功能: lexical_diversity

hotexamples.com的示例: 3

Python lexical_diversity - 已找到3个示例。以下是从开源项目中提取的、评价最高的 nlp.lexical_diversity 真实 Python 代码示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： MakePrediction.py 项目： jatinjindalj/CraigPredict

def MakePrediction(nr, title, location, price, nr_pics, description, nr_links,
                   contact, note, coord, car_model, cylinders, drive, fuel,
                   odometer, color, car_size, car_status, transmission,
                   car_type):
    """Score a car listing and suggest edits that raise its sale probability.

    Loads the pickled classifier and category dictionaries that belong to the
    car model named in ``car_model``, builds one feature vector for the
    listing, scores it, and then re-scores variations of the listing (picture
    count, price, description length, sentence/word ratio, lexical diversity)
    to find the variation with the highest predicted probability.

    Args:
        nr: Number of days; selects the learner pickle
            (``RFLearner<Model>NoTextNYC<nr>.p``) and is quoted in the advice.
        title: Listing title; only its whitespace-separated word count is used.
        location: Looked up (lower-cased) in the location dictionary; unknown
            values are encoded as -100.
        price: Numeric asking price.
        nr_pics: Number of pictures in the listing.
        description: Listing text, either UTF-8 bytes or already-decoded text.
        nr_links: Passed to the classifier unchanged.
        contact: Passed to the classifier unchanged.
        note: Unused.
        coord: Passed to the classifier unchanged.
        car_model: Text whose first space-separated token is the model year
            and which contains "honda civic", "toyota camry" or
            "nissan altima" (case-insensitive).
        cylinders: Encoded as 0 when it is "" and 1 otherwise.
        drive: Encoded as 0 when it is "" and 1 otherwise.
        fuel: Encoded as 1 when it equals 'gas', else 0.
        odometer: Odometer reading; values that cannot be converted to a
            number (such as "") are encoded as -10000.
        color: Looked up in the color dictionary; unknown values become -100.
        car_size: Unused.
        car_status: Encoded as 1 when it equals 'clean', else 0.
        transmission: Encoded as 1 when it equals "automatic", else 0.
        car_type: Looked up in the car-type dictionary; unknown -> -100.

    Returns:
        A tuple ``(prob, messages)``. ``prob`` is the integer percentage
        predicted for the listing as given. ``messages`` is a list of advice
        strings; it is empty when no variation beats ``prob``, otherwise its
        first entry is a header quoting the best probability found.

    Raises:
        ValueError: If ``car_model`` names no supported model or does not
            start with an integer year.
    """
    # Imported locally so the function does not rely on the module header.
    import itertools

    def load_pickle(path):
        # SECURITY: pickle.load can execute arbitrary code; these files must
        # only ever come from a trusted training run.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def sale_probability(features):
        # predict_proba expects a 2-D (n_samples, n_features) array.
        proba = learner.predict_proba(features.values.reshape(1, -1))
        return int(proba.ravel()[1] * 100)

    # Pick the pickle-file tag that matches the advertised model.
    model_name = car_model.lower()
    for needle, tag in (("honda civic", "HondaCivic"),
                        ("toyota camry", "ToyotaCamry"),
                        ("nissan altima", "NissanAltima")):
        if needle in model_name:
            break
    else:
        raise ValueError("Unsupported car model: {!r}".format(car_model))

    learner = load_pickle("RFLearner" + tag + "NoTextNYC" + str(nr) + ".p")
    color_dict = load_pickle("Color_" + tag + "_Dict.p")
    cartype_dict = load_pickle("CarType_" + tag + "_Dict.p")
    location_dict = load_pickle("LocationDict_" + tag + ".p")

    # NOTE(review): TrainModel.py computes LenTitle as the word count minus
    # one; confirm which convention the pickled learners expect.
    len_title = len(title.split())
    len_description = len(description.split())

    color = color_dict.get(color, -100)
    car_type = cartype_dict.get(car_type, -100)

    # The training code lower-cases locations and maps missing ones to ""
    # before building the dictionary, so normalise the key the same way.
    location_key = "" if location is None else location
    if hasattr(location_key, "lower"):
        location_key = location_key.lower()
    location = location_dict.get(location_key, -100)

    car_status = 1 if car_status == 'clean' else 0
    cylinders = 0 if cylinders == "" else 1
    drive = 0 if drive == "" else 1
    transmission = 1 if transmission == "automatic" else 0
    fuel = 1 if fuel == 'gas' else 0

    # The year must be an int: a str compared with 2015 is always False on
    # Python 2 and a TypeError on Python 3.
    try:
        year = int(car_model.split(" ")[0])
    except ValueError:
        raise ValueError(
            "car_model must start with the model year: {!r}".format(car_model))

    try:
        odometer = float(odometer)
    except (TypeError, ValueError):
        odometer = -10000  # missing / unparseable reading
    else:
        if odometer < 1000 and year < 2015:
            odometer *= 1000  # readings are often given in thousands

    text = description
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Divide once only, exactly as the training code does.
    ratio_sentences_words = nlp.sentence_count(text) / (len_description + 1.)
    lex_diversity = nlp.lexical_diversity(text)

    # Column order must match the training matrix, which puts LexDiversity
    # before RatioSentencesWords; the learner only sees positions.
    X1 = pd.Series(
        [
            contact, nr_pics, price, len_title, len_description, coord,
            cylinders, drive, fuel, odometer, color, car_status, transmission,
            year, car_type, nr_links, lex_diversity, ratio_sentences_words,
            location,
        ],
        index=[
            'Contact', 'NrPics', 'Price', 'LenTitle', 'LenDescription',
            'COORD', 'Cylinders', 'Drive', 'Fuel', 'Odometer', 'Color',
            'CarStatus', 'Transmission', 'Year', 'CarType', 'NrLinks',
            'LexDiversity', 'RatioSentencesWords', 'LocationCats',
        ])

    prob = sale_probability(X1)

    # Index encodings: pictures/description 0 -> x0.5, 1 -> x1, 3 -> x2;
    # price 0/1/2 -> 100/95/90 percent; ratio/diversity 0/1/2 -> x0.5/x1/x1.5.
    # The unchanged option comes first so that, with the strict ">" below, a
    # change is only recommended when it really beats leaving things alone.
    pic_steps = (1, 0, 3) if nr_pics else (1,)  # nothing to scale without pics
    price_steps = (0, 1, 2)
    desc_steps = (1, 0, 3)
    scale_steps = (1, 0, 2)

    X1S = X1.copy()
    max_prob = prob
    best_ratio = ratio_sentences_words
    best_pic_index = 1
    best_price_index = 0  # 0 is the "keep the price" option
    best_desc_index = 1
    best_ratio_index = 1
    best_lex_index = 1
    for ch_pic, pr, len_des, rs, ld in itertools.product(
            pic_steps, price_steps, desc_steps, scale_steps, scale_steps):
        X1S['NrPics'] = int((ch_pic * 0.5 + 0.5) * nr_pics)
        X1S['Price'] = (100 - pr * 5) * 0.01 * price
        X1S['LenDescription'] = int((len_des * 0.5 + 0.5) * len_description)
        X1S['RatioSentencesWords'] = (rs * 0.5 + 0.5) * ratio_sentences_words
        X1S['LexDiversity'] = (ld * 0.5 + 0.5) * lex_diversity
        prob1 = sale_probability(X1S)
        if prob1 > max_prob:
            max_prob = prob1
            best_pic_index = ch_pic
            best_price_index = pr
            best_desc_index = len_des
            best_ratio_index = rs
            best_lex_index = ld
            best_ratio = X1S['RatioSentencesWords']

    # Debug output kept from the original implementation.
    print("%s %s %s" % (ratio_sentences_words, 0.5 * ratio_sentences_words,
                        best_ratio))

    messages = []
    if best_pic_index == 0:
        messages.append("- Include half as many pictures. \n")
    elif best_pic_index == 3:
        messages.append("- Include twice as many pictures. \n")
    if best_price_index == 1:
        messages.append(
            "- Reduce the price by 5 percent to {} dollars. \n".format(
                price * 0.95))
    elif best_price_index == 2:
        messages.append(
            "-  Reduce the price by 10 percent to {} dollars. \n".format(
                price * 0.9))
    if best_desc_index == 0:
        messages.append(
            "- Reduce the number of words in the description by  50 percent. \n"
        )
    elif best_desc_index == 3:
        messages.append(
            "- Include twice as many words in  the description. \n")
    if best_ratio_index == 0:
        messages.append("- Use longer sentences in the description. \n")
    elif best_ratio_index == 2:
        messages.append("- Use shorter sentences in the description. \n")
    if best_lex_index == 0:
        messages.append("- Formulate a less lexically diverse description.\n")
    elif best_lex_index == 2:
        messages.append("- Formulate a more lexically diverse description. \n")
    if messages:
        messages.insert(
            0,
            "Your car will sell within {} days with a probability of {} percent if you \n"
            .format(nr, max_prob))

    return (prob, messages)

示例#2

显示文件

文件： MakePrediction.py 项目： cmn2014/CraigPredict

def MakePrediction(nr,title,location, price,nr_pics,description,nr_links,contact,note,coord,car_model, cylinders, drive, fuel, odometer, color, car_size, car_status, transmission, car_type):
    """Score a car listing and suggest edits that raise its sale probability.

    Loads the pickled classifier and category dictionaries that belong to the
    car model named in ``car_model``, builds one feature vector for the
    listing, scores it, and then re-scores variations of the listing (picture
    count, price, description length, sentence/word ratio, lexical diversity)
    to find the variation with the highest predicted probability.

    Args:
        nr: Number of days; selects the learner pickle
            (``RFLearner<Model>NoTextNYC<nr>.p``) and is quoted in the advice.
        title: Listing title; only its whitespace-separated word count is used.
        location: Looked up (lower-cased) in the location dictionary; unknown
            values are encoded as -100.
        price: Numeric asking price.
        nr_pics: Number of pictures in the listing.
        description: Listing text, either UTF-8 bytes or already-decoded text.
        nr_links: Passed to the classifier unchanged.
        contact: Passed to the classifier unchanged.
        note: Unused.
        coord: Passed to the classifier unchanged.
        car_model: Text whose first space-separated token is the model year
            and which contains "honda civic", "toyota camry" or
            "nissan altima" (case-insensitive).
        cylinders: Encoded as 0 when it is "" and 1 otherwise.
        drive: Encoded as 0 when it is "" and 1 otherwise.
        fuel: Encoded as 1 when it equals 'gas', else 0.
        odometer: Odometer reading; values that cannot be converted to a
            number (such as "") are encoded as -10000.
        color: Looked up in the color dictionary; unknown values become -100.
        car_size: Unused.
        car_status: Encoded as 1 when it equals 'clean', else 0.
        transmission: Encoded as 1 when it equals "automatic", else 0.
        car_type: Looked up in the car-type dictionary; unknown -> -100.

    Returns:
        A tuple ``(prob, messages)``. ``prob`` is the integer percentage
        predicted for the listing as given. ``messages`` is a list of advice
        strings; it is empty when no variation beats ``prob``, otherwise its
        first entry is a header quoting the best probability found.

    Raises:
        ValueError: If ``car_model`` names no supported model or does not
            start with an integer year.
    """
    # Imported locally so the function does not rely on the module header.
    import itertools

    supported_models = (
        ("honda civic", "HondaCivic"),
        ("toyota camry", "ToyotaCamry"),
        ("nissan altima", "NissanAltima"),
    )

    def read_pickle(file_name):
        # SECURITY: pickle.load can execute arbitrary code; these files must
        # only ever come from a trusted training run.
        with open(file_name, "rb") as stream:
            return pickle.load(stream)

    def percent_sold(sample):
        # predict_proba expects a 2-D (n_samples, n_features) array.
        proba = learner.predict_proba(sample.values.reshape(1, -1))
        return int(proba.ravel()[1] * 100)

    # check model
    lowered_model = car_model.lower()
    matching_tags = [tag for needle, tag in supported_models
                     if needle in lowered_model]
    if not matching_tags:
        raise ValueError("Unsupported car model: {!r}".format(car_model))
    tag = matching_tags[0]

    learner = read_pickle("RFLearner" + tag + "NoTextNYC" + str(nr) + ".p")
    color_dict = read_pickle("Color_" + tag + "_Dict.p")
    cartype_dict = read_pickle("CarType_" + tag + "_Dict.p")
    location_dict = read_pickle("LocationDict_" + tag + ".p")

    # NOTE(review): TrainModel.py computes LenTitle as the word count minus
    # one; confirm which convention the pickled learners expect.
    len_title = len(title.split())
    len_description = len(description.split())

    color = color_dict.get(color, -100)
    car_type = cartype_dict.get(car_type, -100)

    # trainModel lower-cases locations and maps missing ones to "" before
    # building the dictionary, so normalise the lookup key the same way.
    location_key = "" if location is None else location
    if hasattr(location_key, "lower"):
        location_key = location_key.lower()
    location = location_dict.get(location_key, -100)

    car_status = 1 if car_status == 'clean' else 0
    cylinders = 0 if cylinders == "" else 1
    drive = 0 if drive == "" else 1
    transmission = 1 if transmission == "automatic" else 0
    fuel = 1 if fuel == 'gas' else 0

    # The year must be an int: a str compared with 2015 is always False on
    # Python 2 and a TypeError on Python 3.
    try:
        year = int(car_model.split(" ")[0])
    except ValueError:
        raise ValueError(
            "car_model must start with the model year: {!r}".format(car_model))

    try:
        odometer = float(odometer)
    except (TypeError, ValueError):
        odometer = -10000  # missing / unparseable reading
    else:
        if odometer < 1000 and year < 2015:
            odometer *= 1000  # often, 178000 is given as 178

    text = description
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Divide once only, exactly as trainModel does.
    ratio_sentences_words = nlp.sentence_count(text) / (len_description + 1.)
    lex_diversity = nlp.lexical_diversity(text)

    # Column order must match the matrix built in trainModel, which puts
    # LexDiversity before RatioSentencesWords; the learner only sees positions.
    feature_names = ['Contact', 'NrPics', 'Price', 'LenTitle', 'LenDescription', 'COORD',
                     'Cylinders', 'Drive', 'Fuel', 'Odometer', 'Color', 'CarStatus',
                     'Transmission', 'Year', 'CarType', 'NrLinks', 'LexDiversity',
                     'RatioSentencesWords', 'LocationCats']
    feature_values = [contact, nr_pics, price, len_title, len_description, coord,
                      cylinders, drive, fuel, odometer, color, car_status,
                      transmission, year, car_type, nr_links, lex_diversity,
                      ratio_sentences_words, location]
    X1 = pd.Series(feature_values, index=feature_names)

    prob = percent_sold(X1)

    # Index encodings: pictures/description 0 -> x0.5, 1 -> x1, 3 -> x2;
    # price 0/1/2 -> 100/95/90 percent; ratio/diversity 0/1/2 -> x0.5/x1/x1.5.
    # The unchanged option comes first so that, with the strict ">" below, a
    # change is only recommended when it really beats leaving things alone.
    pic_steps = (1, 0, 3) if nr_pics else (1,)  # nothing to scale without pics
    price_steps = (0, 1, 2)
    desc_steps = (1, 0, 3)
    scale_steps = (1, 0, 2)

    X1S = X1.copy()
    max_prob = prob
    best_ratio = ratio_sentences_words
    # (pictures, price, description, ratio, diversity); price 0 = unchanged.
    best_steps = (1, 0, 1, 1, 1)
    for steps in itertools.product(pic_steps, price_steps, desc_steps,
                                   scale_steps, scale_steps):
        ch_pic, pr, len_des, rs, ld = steps
        X1S['NrPics'] = int((ch_pic * 0.5 + 0.5) * nr_pics)
        X1S['Price'] = (100 - pr * 5) * 0.01 * price
        X1S['LenDescription'] = int((len_des * 0.5 + 0.5) * len_description)
        X1S['RatioSentencesWords'] = (rs * 0.5 + 0.5) * ratio_sentences_words
        X1S['LexDiversity'] = (ld * 0.5 + 0.5) * lex_diversity
        prob1 = percent_sold(X1S)
        if prob1 > max_prob:
            max_prob = prob1
            best_steps = steps
            best_ratio = X1S['RatioSentencesWords']

    (best_pic_index, best_price_index, best_desc_index,
     best_ratio_index, best_lex_index) = best_steps

    # Debug output kept from the original implementation.
    print("%s %s %s" % (ratio_sentences_words, 0.5 * ratio_sentences_words, best_ratio))

    messages = []
    if best_pic_index == 0:
        messages.append("- Include half as many pictures. \n")
    elif best_pic_index == 3:
        messages.append("- Include twice as many pictures. \n")
    if best_price_index == 1:
        messages.append("- Reduce the price by 5 percent to {} dollars. \n".format(price*0.95))
    elif best_price_index == 2:
        messages.append("-  Reduce the price by 10 percent to {} dollars. \n".format(price*0.9))
    if best_desc_index == 0:
        messages.append("- Reduce the number of words in the description by  50 percent. \n")
    elif best_desc_index == 3:
        messages.append("- Include twice as many words in  the description. \n")
    if best_ratio_index == 0:
        messages.append("- Use longer sentences in the description. \n")
    elif best_ratio_index == 2:
        messages.append("- Use shorter sentences in the description. \n")
    if best_lex_index == 0:
        messages.append("- Formulate a less lexically diverse description.\n")
    elif best_lex_index == 2:
        messages.append("- Formulate a more lexically diverse description. \n")
    if messages:
        messages.insert(0,"Your car will sell within {} days with a probability of {} percent if you \n".format(nr, max_prob))

    return (prob,  messages)

示例#3

显示文件

文件： TrainModel.py 项目： cmn2014/CraigPredict

def trainModel(car_model="Toyota Camry", save_learners=False):
    """Train and evaluate random-forest "sold within n days" classifiers.

    Steps, in order:
      1. Read the listings table for ``car_model`` from the database and keep
         rows with ``SoldDays > 0`` whose ``CarModel`` mentions the model.
      2. Derive the numeric features and pickle the category dictionaries
         (``Color_<Tag>_Dict.p``, ``CarType_<Tag>_Dict.p``,
         ``LocationDict_<Tag>.p``).
      3. For n = 1..20 days, grid-search ``max_features`` and ``max_depth``
         with 10-fold cross-validated AUC and write the best setting to
         ``BestParams<Tag>_NY_NoText.txt``.
      4. Refit one classifier per n on all data (optionally pickled).
      5. Plot mean 5-fold ROC curves for n = 3, 6, ..., 15 and save them to
         ``ROC_RF_NoText_<Tag>_NYC.pdf``.

    ``<Tag>`` is ``car_model`` with spaces removed; it is also the table name.

    Args:
        car_model: Model name made of letters, digits and spaces, e.g.
            "Toyota Camry" or "Nissan Altima".
        save_learners: When true, pickle each refitted classifier to
            ``RFLearner<Tag>NoTextNYC<n>.p``. Off by default.

    Raises:
        ValueError: If ``car_model`` contains characters other than letters,
            digits and spaces (it is interpolated into the SQL statement).
    """
    tag = car_model.replace(" ", "")
    if not tag.isalnum():
        raise ValueError(
            "car_model may only contain letters, digits and spaces: {!r}"
            .format(car_model))

    # NOTE(review): the connection credentials are hard-coded here; consider
    # moving them to configuration.
    con = mdb.connect('localhost', 'charlotte', 'insight', 'LocalClassifieds')
    try:
        # Safe to concatenate: ``tag`` was validated as alphanumeric above.
        df = pd.read_sql("SELECT * FROM " + tag, con)
    finally:
        con.close()

    mentions_model = df['CarModel'].str.contains(car_model, case=False)
    # Copy so the derived columns are written to an independent frame rather
    # than to a slice of ``df``.
    data = df[(df['SoldDays'] > 0) & mentions_model].copy()
    data.reset_index(inplace=True)
    print("Number of training items: {}".format(len(data)))

    data['Year'] = [int(parts[0]) for parts in data['CarModel'].str.split(" ")]
    data['Odometer'] = data['Odometer'].convert_objects(convert_numeric=True)
    # often, 178000 is given as 178 etc.
    data.loc[(data['Odometer'] < 1000) & (data['Year'] < 2015), 'Odometer'] *= 1000
    data.loc[data['Odometer'].isnull(), 'Odometer'] = -10000

    # variables: length of title, length of description, lexical diversity,
    # nr sentences / nr words
    data['LenTitle'] = data['Title'].str.split().apply(len) - 1
    data['LenDescription'] = data['Description'].str.split().apply(len)
    texts = [text.decode('utf-8') if isinstance(text, bytes) else text
             for text in data['Description'].values]
    data['RatioSentencesWords'] = [nlp.sentence_count(text) for text in texts]
    data['RatioSentencesWords'] = data['RatioSentencesWords'] / (data['LenDescription'] + 1.)
    data['LexDiversity'] = [nlp.lexical_diversity(text) for text in texts]

    # Binary flags: 1 when the column equals the given value, else 0.
    for column, positive in (('Fuel', 'gas'),
                             ('Transmission', 'automatic'),
                             ('CarStatus', 'clean')):
        data[column] = (data[column] == positive).astype(int)
    # Presence flags: 0 for an empty string, 1 for anything else.
    for column in ('Cylinders', 'Drive'):
        data[column] = (data[column] != "").astype(int)

    # Integer-encode the categorical columns and store the mapping under the
    # name of the model that was actually trained.
    for col in ('Color', 'CarType'):
        cat_dict = {cat: code
                    for code, cat in enumerate(data[col].unique(), 1)}
        data[col] = data[col].map(cat_dict)
        with open(col + "_" + tag + "_Dict.p", "wb") as handle:
            pickle.dump(cat_dict, handle)

    data['Location'] = data['Location'].fillna("").str.lower()
    loc_cts = data['Location'].value_counts()
    loc_dict = {}
    for code, loc in enumerate(data['Location'].unique(), 1):
        # Locations seen fewer than five times share the code 0.
        loc_dict[loc] = code if loc_cts[loc] >= 5 else 0
    data['LocationCats'] = data['Location'].map(loc_dict)
    with open("LocationDict_" + tag + ".p", "wb") as handle:
        pickle.dump(loc_dict, handle)

    feature_names = [
        'Contact', 'NrPics', 'Price', 'LenTitle', 'LenDescription', 'COORD',
        'Cylinders', 'Drive', 'Fuel', 'Odometer', 'Color', 'CarStatus',
        'Transmission', 'Year', 'CarType', 'NrLinks', 'LexDiversity',
        'RatioSentencesWords', 'LocationCats',
    ]
    X = data[feature_names].values
    DataDict = dict(enumerate(feature_names))  # column position -> name

    def label_sold_within(days):
        # Classification target: 1 when the car sold within ``days`` days.
        data['Sold'] = (data['SoldDays'] <= days).astype(int)
        return data['Sold'].values

    max_scores = np.zeros(31)
    best_feat = np.zeros(31)
    best_depth = np.zeros(31)
    with open('BestParams' + tag + '_NY_NoText.txt', 'w+') as params:
        for n in range(1, 21):
            YC = label_sold_within(n)
            print(data['Sold'].value_counts())
            # Importances of the best setting for *this* n. The original
            # indexed a shared list by n, which does not line up with n.
            best_importances = None
            for max_feat in range(2, 16, 2):
                for depth in range(30, 110, 20):
                    n_folds = 10
                    cv = sklearn.cross_validation.StratifiedKFold(YC, n_folds, shuffle=True)
                    scores = np.zeros(n_folds)
                    for f, (train, test) in enumerate(cv):
                        learner = RandomForestClassifier(
                            n_estimators=300, max_depth=depth, max_features=max_feat)
                        learner.fit(X[train, :], YC[train])
                        probs = learner.predict_proba(X[test, :])
                        # A fold trained on a single class yields one column;
                        # its score stays 0.
                        if probs.shape[1] > 1:
                            fpr, tpr, thresholds = sklearn.metrics.roc_curve(YC[test], probs[:, 1])
                            scores[f] = sklearn.metrics.auc(fpr, tpr)
                    if np.mean(scores) > max_scores[n]:
                        max_scores[n] = np.mean(scores)
                        best_feat[n] = max_feat
                        best_depth[n] = depth
                        # Taken from the last fold's learner for this setting.
                        best_importances = learner.feature_importances_

            summary = "For n = {}, best AUC is {}  for max feat = {}, max depth = {}".format(
                n, max_scores[n], best_feat[n], best_depth[n])
            print(summary)
            params.write(summary + "\n")
            if best_importances is None:
                continue  # no setting scored above 0; nothing to rank
            # Print the 18 most important features, most important first.
            for index in np.argsort(best_importances)[::-1][:18]:
                print("%s %s" % (DataDict[index], best_importances[index]))

    for n in range(1, 21):
        print("%s %s %s" % (n, best_depth[n], best_feat[n]))
        YC = label_sold_within(n)
        # The best_* arrays hold floats; the estimator needs integers.
        learner = RandomForestClassifier(
            n_estimators=500, max_depth=int(best_depth[n]), max_features=int(best_feat[n]))
        learner.fit(X, YC)
        if save_learners:
            with open("RFLearner" + tag + "NoTextNYC" + str(n) + ".p", "wb") as handle:
                pickle.dump(learner, handle)

    for n in range(3, 18, 3):
        YC = label_sold_within(n)
        n_folds = 5
        mean_fpr = np.linspace(0, 1, 100)
        # Start from an array so the indexing below works even when no fold
        # contributes a curve.
        mean_tpr = np.zeros_like(mean_fpr)
        cv = sklearn.cross_validation.StratifiedKFold(YC, n_folds, shuffle=True)
        for train, test in cv:
            learner = RandomForestClassifier(
                n_estimators=1000, max_depth=int(best_depth[n]), max_features=int(best_feat[n]))
            learner.fit(X[train, :], YC[train])
            probs = learner.predict_proba(X[test, :])
            if probs.shape[1] > 1:
                fpr, tpr, thresholds = sklearn.metrics.roc_curve(YC[test], probs[:, 1])
                mean_tpr += interp(mean_fpr, fpr, tpr)
                mean_tpr[0] = 0.0

        mean_tpr /= n_folds
        mean_tpr[-1] = 1.0
        mean_auc = sklearn.metrics.auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr, mean_tpr, '-', label='Mean CV ROC d =%d (AUC = %0.2f) ' % (n, mean_auc), lw=2)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC ' + car_model + ': Prob. of sale within d days')
    plt.legend(loc="lower right")
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.savefig('ROC_RF_NoText_' + tag + '_NYC.pdf')