예제 #1
0
def main():

    """Load city AQI data, print summary statistics, and chart the cleanest cities."""
    aqi_data = pd.read_csv('city_aqi.csv')

    # Four cities with the lowest AQI readings.
    ranked = aqi_data[['City', 'AQI']].sort_values('AQI')
    print(ranked.head(4))

    # Structural summary and a peek at the raw rows.
    print(aqi_data.info())

    print(aqi_data.head())

    print('AQI最大值:', aqi_data['AQI'].max())
    print('AQI最小值:', aqi_data['AQI'].min())
    print('AQI均值:', aqi_data['AQI'].mean())

    best_five = aqi_data.sort_values(by=['AQI']).head(5)
    print('空气质量最好的5个城市:')
    print(best_five)

    worst_five = aqi_data.sort_values(by=['AQI'], ascending=False).head(5)
    print('空气质量最差的5个城市:')
    print(worst_five)

    # Every city whose AQI exceeds 40.
    print(aqi_data[aqi_data['AQI'] > 40])

    # Bar chart of the five cleanest cities; save the figure before showing it.
    best_five.plot(kind='bar', x='City', y='AQI',
                   title='空气质量最好的5个城市', figsize=(10, 10))
    plt.savefig('top5_aqi.png')
    plt.show()
예제 #2
0
    def __init__(self, file, mono=True, cap_train=None, sort=False):
        """Load puzzle data from *file*.

        When ``sort`` is true, puzzles are ordered by how many empty cells
        ('0' characters) they contain before capping; otherwise the first
        ``cap_train`` rows are read directly.  ``cap_train=None`` keeps all
        rows.  ``mono`` is stored as-is for later use.
        """
        if sort:
            frame = pd.read_csv(file)
            # Count the blanks in each puzzle string and order easiest-first.
            frame["num_empty"] = frame['puzzle'].apply(lambda s: s.count('0'))
            frame.sort_values(by="num_empty", inplace=True)

            if cap_train is not None:
                frame = frame.head(cap_train)

            self.data = frame
        else:
            # Unsorted case: let pandas stop reading after cap_train rows.
            self.data = pd.read_csv(file, nrows=cap_train)
        self.mono = mono
        self.cap_train = cap_train
        self.edges = sudoku_edges()
예제 #3
0
 def read_tasks(self, mapper: DataMapper):
     """Read the tab-separated task file and derive scheduling columns.

     Applies the mapper's row/value transforms to build the
     "Time Remaining", "Adjusted Priority", "Sprint", "Confidence",
     "On track", "Submitted" and "Due" columns, sorting the tasks by
     adjusted priority along the way.

     :param mapper: the DataMapper whose map_* callables compute each column.
     :return: the fully annotated, priority-sorted DataFrame.
     """
     self.data_mapper = mapper
     # `tasks` (not `csv`) so the stdlib csv module name is never shadowed.
     tasks = pd.read_csv(self.filename, delimiter="\t")
     # axis=1 means apply per row, not per column.
     tasks["Time Remaining"] = tasks.apply(self.data_mapper.map_submission,
                                           axis=1)
     tasks["Adjusted Priority"] = tasks.apply(
         self.data_mapper.map_adjusted_priority, axis=1)
     tasks = tasks.sort_values(by=["Adjusted Priority"])
     tasks["Sprint"] = tasks["Estimate"].apply(self.data_mapper.map_sprints)
     # Sprint assignment above consumed sprint capacity; reset before the
     # confidence pass re-walks the rows.
     self.data_mapper.sprint_manager.reset()
     tasks["Confidence"] = tasks.apply(self.data_mapper.map_sprint_confidence,
                                       axis=1)
     # Fix: pass the bound method itself, not the result of calling it —
     # consistent with every other map_* usage in this method.
     tasks["On track"] = tasks["Confidence"].apply(
         self.data_mapper.map_on_track)
     tasks["Submitted"] = tasks.apply(self.data_mapper.map_human_time_submitted,
                                      axis=1)
     tasks["Due"] = tasks.apply(self.data_mapper.map_human_time_due, axis=1)
     return tasks
예제 #4
0
# In[21]:


def infra_density(num_o_houses, mask, cell_area=10):
    """Return the housing density over the area covered by *mask*.

    :param num_o_houses: number of houses counted inside the masked region.
    :param mask: array whose non-zero entries mark cells belonging to the
        region of interest.
    :param cell_area: area represented by one non-zero mask cell
        (default 10, matching the original hard-coded factor — units are
        whatever the caller's ground resolution implies).
    :return: houses per unit area (raises ZeroDivisionError if the mask is
        entirely zero, as before).
    """
    area = np.count_nonzero(mask) * cell_area
    return num_o_houses / area


# In[ ]:

# In[11]:

# Load the raw village table (csv_path is defined earlier in the notebook).
# NOTE(review): the name `csv` shadows the stdlib csv module from here on.
csv = pd.read_csv(csv_path)
csv.head(10)  # notebook display only — no effect as a script
# Order rows by village, then census ID, so downstream indexing is stable.
x = csv.sort_values(by=['Village Name', 'Census 2011 ID'])
x.head(10)  # notebook display only

# In[12]:

# Drop the row with index label 23528 — presumably a bad or duplicate
# record; TODO confirm against the source data.
status = x.drop([23528], axis=0)
status.head(10)  # notebook display only
status['Electrified'][0]  # notebook display only

# In[74]:

# Accumulators for the per-village statistics collected below.
index = 0

village_name = list()
ndvi = list()
evi_log = list()
예제 #5
0
def sortSeg(id):
    """Sort one segment file by 'times' (descending) and write it out."""
    print("sorting %d" % id)
    segment = pd.read_csv('../result/segg/seg%d.csv' % id)
    segment = segment.sort_values('times', ascending=False)
    segment.to_csv("../result/seg/%d.csv" % id, index=False)
예제 #6
0
        "slug":
        current["slug"],
        "name":
        current["name"],
        "creator_pseudo":
        current["creator"]["pseudo"],
        "categories":
        "|".join([e["name"] for e in current["categories"]]),
        "youtube_url":
        extract_url(current["links"], "youtube"),
        "twitter_url":
        extract_url(current["links"], "twitter"),
        "tip_amount":
        int(current["parameters"]["tipperAmount"]),
        "tip_number":
        int(current["parameters"]["tipperNumber"]),
    })

# Append the freshly scraped rows; write the header only for a new file.
with open(FILENAME, "a") as f:
    writer = csv.DictWriter(f, data[0].keys(), lineterminator="\n")
    if f.tell() == 0:
        writer.writeheader()
    writer.writerows(data)

# Re-read the full history, dedupe and sort, then rewrite the file.
# Fix: bind the DataFrame to `history`, not `csv` — the original
# assignment shadowed the stdlib csv module used above.
history = pd.read_csv(FILENAME, parse_dates=["date"])

# Keep only the newest record for each (date, slug) pair.
history.drop_duplicates(subset=["date", "slug"], keep="last", inplace=True)
history.sort_values(by=["date", "slug"], inplace=True)

history.to_csv(FILENAME, index=False)
예제 #7
0
def clustering(file_name,times_thr, path_thr,n_clusters) : 
    """
    Build a plate-by-path frequency matrix, cluster the plates with KMeans,
    print the cluster assignments, and plot the result.

     - **parameters**, **types**, **return** and **return types**::
          :param file_name: file name
          :param times_thr: threshold of times, it is going to select greater values
          :param path_thr: threshold of the length of the paths, it is going to select greater values
          :param n_cluster: number of clusters for Kmeans algorithm
          :type file_name: string
          :type times_thr: int
          :type path_thr: int 
          :type n_cluster: int

    NOTE(review): despite the original description, this function has no
    return statement — it prints the cluster->plates mapping and shows the
    plots, then implicitly returns None.
    """
    # Load the plate/path/count table and keep rows seen at least times_thr times.
    csv= pd.read_csv(file_name, sep=',',index_col=None)
    csv = csv.loc[csv['volte']>=times_thr]
    sorted_csv = csv.sort_values('targa')
    # Raw numpy view: column 0 = plate ('targa'), 1 = path, 2 = count ('volte').
    csv_np  = sorted_csv.values
    plates = sorted_csv['targa'].unique()
    lista = list()
    total_paths = np.array([])

    # For each plate, keep only paths with more than path_thr segments
    # ('-' separated); lista[count] alternates [plate, path, count, path, count, ...].
    for count,plate in enumerate(plates) :      
        paths =csv_np[np.where(csv_np[:,0] == plate),1][0]
        volte =csv_np[np.where(csv_np[:,0] == plate),2][0]     
        lista.insert(count, [plate])
        i=0
        while i < len(paths) :
            if  len(paths[i].split('-')) <= path_thr : 
                i+=1   
                continue       
            lista[count].append(paths[i])
            total_paths= np.append(total_paths,paths[i])
            lista[count].append(volte[i])
            i+=1
    total_paths = np.unique(total_paths)  
    plates = np.array([],dtype=int) 
    # Drop plates that kept no path, then rebuild the plate array.
    lista = [x for x in lista if len(x)>1]
    for x in lista :
        plates = np.append(plates,x[0]) 
        
    # Feature matrix: one row per surviving plate, one column per distinct
    # path; each cell holds how many times that plate travelled that path.
    # NOTE(review): int8 silently truncates counts above 127 — verify range.
    data = np.zeros(shape=(len(lista),len(total_paths)), dtype = np.int8)
    print "selected plates: "+str(plates)+"\nselected paths: " + str(total_paths)+"\nmatrix shape: " + str(data.shape) 
    for count,element in enumerate(lista) :
        i=1
        # element[i] is a path, element[i+1] its count — hence the step of 2.
        while i <  len(element) :
            index = total_paths.tolist().index(element[i])
            data[count,index] = element[i+1]
            i+=2
    print data 
    
    # KMeans clustering plus silhouette diagnostics (fixed random_state for
    # reproducibility; data is fit twice — once for labels_, once for predict).
    clusterer= KMeans(n_clusters=n_clusters, random_state=10)
    kmeans = clusterer.fit(data)
    cluster_labels = clusterer.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    print "For n_clusters =", n_clusters,"The average silhouette_score is :", silhouette_avg
    sample_silhouette_values = silhouette_samples(data, cluster_labels)
    # Map each cluster label to the list of plates assigned to it.
    clusters_map = {}
    for cluster in kmeans.labels_ :
        clusters_map[cluster] = []
    for i,plate in enumerate(lista) :
        clusters_map[kmeans.labels_[i]].append(plate[0])
    print "\ncluster\tplates"
    for k,v in clusters_map.items():
             print k,"\t",v
    # Left axes: bar chart of cluster sizes; right axes: scatter of the
    # first two feature-matrix columns colored by cluster.
    fig, (ax1,ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    # NOTE(review): cm.spectral was removed in modern matplotlib; newer
    # versions need e.g. cm.nipy_spectral or plt.get_cmap('Spectral').
    colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(data[:, 0], data[:, 1], marker='.', s=30, lw=0, alpha=0.7,c=colors, edgecolor='k')
    centers = clusterer.cluster_centers_
    # White circles mark the centroids; the digit markers label each one.
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',c="white", alpha=1, s=200, edgecolor='k')
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')
    plt.suptitle(("KMeans clustering on sample data "
                  "with n_clusters = %d\nThe average silhouette_score is %0.4f" % (n_clusters,silhouette_avg)),
                 fontsize=14, fontweight='bold')
    ax1.bar(clusters_map.keys(),[len(clusters_map[x]) for x in clusters_map.keys() ],color='r')
    ax1.set_ylabel("number of cars")
    ax1.set_xlabel("clusters")
    ax1.set_title("K-Means clustering")
    plt.show()