Example #1
0
def format_dataframe(df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]:
    """
    Create a unique descriptor vector id, generate the T5 and T6 mapping tables, and save descriptor-based duplicates.

    Args:
        df (DataFrame): dataframe with descriptor features and values, as well as the input compound id.

    Returns:
        Tuple[DataFrame, DataFrame, DataFrame]: T5 mapping table, T6 table, descriptor-based duplicates
    """
    # identify duplicated fingerprints and assign a unique descriptor vector ID
    # to each ('fp_feat', 'fp_val', 'fold_id') combination
    df["descriptor_vector_id"] = df.groupby(["fp_feat", "fp_val",
                                             "fold_id"]).ngroup()
    # extract the mapping table before duplicate checking
    df_T5 = df[["input_compound_id", "fold_id", "descriptor_vector_id"]]
    # sort now, as T6 and the duplicates table require sorted dataframes
    df = df.sort_values("descriptor_vector_id")
    # identify duplicates
    # duplicate removal based on the descriptor vector ID is sufficient, because it implies unique 'fp_feat', 'fp_val', 'fold_id' combinations
    df_T6 = df.drop_duplicates("descriptor_vector_id")[[
        "descriptor_vector_id", "fp_feat", "fp_val", "fold_id"
    ]]
    is_duplicated = df.duplicated(["descriptor_vector_id"], keep=False)
    df_duplicates = df.loc[is_duplicated, [
        "input_compound_id",
        "canonical_smiles",
        "fp_feat",
        "fp_val",
        "fold_id",
        "descriptor_vector_id",
    ]]

    return df_T5, df_T6, df_duplicates
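A minimal usage sketch (not from the original source); it assumes pandas is available and that the input frame already carries the columns the function reads ('input_compound_id', 'canonical_smiles', 'fp_feat', 'fp_val', 'fold_id'):

import pandas as pd
from pandas import DataFrame
from typing import Tuple

toy = pd.DataFrame({
    "input_compound_id": [1, 2, 3],
    "canonical_smiles": ["CCO", "OCC", "CCN"],
    "fp_feat": ["a|b", "a|b", "a|c"],   # compounds 1 and 2 share a descriptor
    "fp_val": ["1|1", "1|1", "1|2"],
    "fold_id": [0, 0, 1],
})
t5, t6, dups = format_dataframe(toy)
print(t5)    # input_compound_id -> descriptor_vector_id mapping
print(dups)  # compounds 1 and 2 appear here because they share a descriptor vector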
Example #2
0
def find_similar_test(query_str, str_list):
    # df_temp=df
    similarity = []
    for index in range(len(str_list)):
        # text similarity: Hamming-style count of matching positions
        if len(str_list[index]) - len(query_str) > 0:
            hanming = []
            for i in range(len(str_list[index]) - len(query_str) + 1):
                str_comp = str_list[index][i:i + len(query_str)]
                han = sum(el1 == el2 for el1, el2 in zip(str_comp, query_str))
                hanming.append(han)
            sim = numpy.max(hanming)
        else:
            hanming = []
            for i in range(len(query_str) - len(str_list[index]) + 1):
                str_comp = query_str[i:i + len(str_list[index])]
                han = sum(el1 == el2
                          for el1, el2 in zip(str_comp, str_list[index]))
                hanming.append(han)
            sim = numpy.max(hanming)
        similarity.append(sim)

    # similarity is the list of similarity scores; comparing the correctness of the two approaches could perhaps start from this list?
    df_str = {"similarity": similarity, "str": str_list}
    str_with_sim = DataFrame(df_str)
    #print str_with_sim
    # the number passed to head() controls how many closest values are kept; sort the dataframe by similarity
    str_with_sim = str_with_sim.drop_duplicates().sort_values(
        by=['similarity'], ascending=False)
    # str_with_sim = str_with_sim[str_with_sim.similarity == max(similarity)].drop_duplicates()
    print(str_with_sim)
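A minimal usage sketch (assumed setup: numpy and pandas.DataFrame imported as the snippet above expects):

import numpy
from pandas import DataFrame

candidates = ["hello world", "hell", "help wanted", "unrelated text"]
find_similar_test("hello", candidates)
# prints the candidates ranked by the maximum number of position-wise character
# matches between the query and any equal-length window of each candidate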
Example #3
0
 def get_nodes_edges(self, graph):
     '''
     Store the nodes and edges of the network,
     extracting only the edges and nodes of the largest connected subgraph.
     '''
     #edges = graph.edges()
     res = [
         c for c in sorted(
             nx.connected_components(graph), key=len, reverse=True)
     ]
     nodes = res[0]
     edges = np.array(graph.edges())
     # keep only edges whose endpoints both lie in the largest component
     # (np.isin needs a sequence, so convert the node set to a list)
     in_component = np.isin(edges, list(nodes))
     sub_graph_edges = edges[in_component[:, 0] & in_component[:, 1]]
     sub_graph_edges = DataFrame(sub_graph_edges)
     sub_graph_edges.drop_duplicates(inplace=True)
     return nodes, sub_graph_edges
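A minimal usage sketch (assumptions: networkx/numpy/pandas imported as nx/np/DataFrame in the snippet's module; 'self' is unused by the method body, so None stands in for it purely for illustration):

import networkx as nx

g = nx.Graph()
g.add_edges_from([(1, 2), (2, 3), (4, 5)])    # two components; {1, 2, 3} is the largest
nodes, sub_edges = get_nodes_edges(None, g)   # None stands in for the unused 'self'
print(nodes)        # {1, 2, 3}
print(sub_edges)    # DataFrame containing the edges (1, 2) and (2, 3)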
Example #4
0
def get_count():
  all_item_id = []
  user_count = len(train_set_mini['user_id'].drop_duplicates())
  cate_count = len(train_set_mini['cate_list'].drop_duplicates())

  train_set_mini['viewed_item_id_list'] = train_set_mini.viewed_item_id.apply(lambda x: x[1:-1].split(','))
  train_set_mini_item_id_list = train_set_mini.viewed_item_id_list
  for item_id_list in train_set_mini_item_id_list:
    all_item_id.extend(item_id_list)
  all_item_id_df = DataFrame(all_item_id)
  item_count = len(all_item_id_df.drop_duplicates())
  return user_count, item_count, cate_count
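A minimal usage sketch (toy data; the global train_set_mini and its columns are assumptions matching what the function reads):

import pandas as pd
from pandas import DataFrame

train_set_mini = pd.DataFrame({
    "user_id": [1, 1, 2],
    "cate_list": ["shoes", "shoes", "books"],
    "viewed_item_id": ["[10,11]", "[10]", "[12,13]"],   # stringified lists, as the slicing/split implies
})
print(get_count())   # (2, 4, 2): 2 users, 4 distinct item ids, 2 categories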
    def __getitem__(self, index):
        entry = self.entries[index]

        features = np.load(
            os.path.join(self.data_dir, self.name,
                         str(entry['index']) + '.npy'))
        boxes = entry['boxes']

        if self.ROI_select == 0:
            features = features[:self.max_area_boxes]
            boxes = boxes[:self.max_area_boxes]
        elif self.ROI_select == 1:  # for hard samples only area-based filtering was tried, but it was not used for the best score
            ## truncate to the TOP boxes by area
            if boxes.shape[0] > self.max_area_boxes:
                area_count = []
                for box_idx, box in enumerate(boxes):  # avoid shadowing the 'index' argument
                    area = (box[2] - box[0]) * (box[3] - box[1])
                    area_count.append(area)

                data = DataFrame({
                    "index": range(len(area_count)),
                    'area': area_count
                })

                data = data.loc[data['area'].rank() > len(area_count) -
                                self.max_area_boxes]
                data = data.drop_duplicates(subset=['area'])
                data_index_2 = data['index'].tolist()

                features = features[data_index_2]
                boxes = boxes[data_index_2]

        features = np.pad(features,
                          ((0, self.max_area_boxes - features.shape[0]),
                           (0, 0)),
                          mode='constant',
                          constant_values=0)

        img_h, img_w = entry['image_h'], entry['image_w']
        boxes = boxes.copy()
        boxes[:, (0, 2)] /= img_w
        boxes[:, (1, 3)] /= img_h

        boxes = np.pad(boxes,
                       ((0, self.max_area_boxes - boxes.shape[0]), (0, 0)),
                       mode='constant',
                       constant_values=0)

        query = entry['query']
        question_id = entry['query_id']
        product_id = entry['product_id']

        return features, boxes, query, question_id, product_id
Example #6
0
def _drop_duplicates(df: DataFrame, cols: Sequence[str]) -> DataFrame:
    """Drop duplicates and then sort the DataFrame.

    Args:
        df: DataFrame to have duplicates removed.
        cols: Columns for use in removing duplicates and for sorting.

    Returns:
        A DataFrame with duplicates removed (only the last duplicate is kept).
        The DataFrame is sorted according to the columns provided.
    """
    df = df.drop_duplicates(subset=cols, keep="last")
    return df.sort_values(by=cols, ignore_index=True)
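A minimal usage sketch (toy data, not from the original source):

import pandas as pd

frame = pd.DataFrame({"wmo_id": [3, 1, 1], "time": [0, 1, 1], "value": [9, 7, 8]})
print(_drop_duplicates(frame, ["wmo_id", "time"]))
# the duplicated (1, 1) pair keeps only its last row (value 8),
# and the result is sorted by wmo_id then time with a fresh index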
Example #7
0
 def get_idList(self, bw_id=None):
     with open(self.filename, 'r', encoding='utf-8') as f:
         reader = csv.DictReader(f)
         idList = [row['bw_id'] for row in reader]
         if self.temp:
             # reduce duplicate crawling caused by out-of-order retweets
             # a set would reorder the ids, which breaks resuming from the checkpoint
             df = DataFrame(idList)
             df.columns = ['bw_id']
             df = df.drop_duplicates(keep='last')
             idList = df['bw_id']
             idList = idList.tolist()
             if bw_id:
                 pos = idList.index(bw_id)  # bw_id must be given as a string
                 idList = idList[pos + 1:]
         return idList
def quChong(inputFileName):
    data = getLocation(inputFileName)
    df = DataFrame(data,
                   columns=[
                       'Station MAC', 'First time seen', 'Last time seen',
                       'Power', ' packets', 'BSSID', 'Probed ESSIDs',
                       'location'
                   ])
    newDF = df.drop_duplicates()
    newDF = newDF.drop(0)
    #print(newDF)
    #print(type(newDF))

    #print(newDF.dtypes)
    #print(newDF.head())
    return newDF
Example #9
0
def label_encode(df):
    """
    become nominal value to number value
    :return: label encoded
    """
    sql_manager = SqlManager("information.sqlite")
    for column in main_columns:
        if str(df[column].dtype) == "object":
            le = preprocessing.LabelEncoder()
            label_encoded = le.fit_transform(df[column])
            df2 = DataFrame({"main": df[column].copy()})
            df[column] = label_encoded
            df2["encode"] = df[column].copy()
            df2 = df2.drop_duplicates()
            df2.to_sql(name="encoding_guide",
                       con=sql_manager.conn,
                       if_exists="replace",
                       index=False)
Example #10
0
def label_encode(column):
    """
    become nominal value to number value
    :param column: each column
    :return: label encoded
    """
    sql_manager = SqlManager("information.sqlite")
    column_value = sql_manager.crs.execute(
        'select  {} from information '.format(column)).fetchall()

    labels = [x[0] for x in list(column_value)]
    if type(labels[0]) == int:
        label_encoded = labels
    else:
        le = preprocessing.LabelEncoder()
        label_encoded = le.fit_transform(labels)
    df = DataFrame({"Lable": labels, "encode": label_encoded, "column": column})
    df = df.drop_duplicates()
    df.to_sql(name="encoding_guide", con=sql_manager.conn, if_exists="append")
    return label_encoded
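A minimal sketch of the underlying encoding step (scikit-learn only; SqlManager and the information table used above are assumed to exist elsewhere):

from sklearn import preprocessing
from pandas import DataFrame

labels = ["red", "green", "red", "blue"]
encoded = preprocessing.LabelEncoder().fit_transform(labels)   # e.g. array([2, 1, 2, 0])
guide = DataFrame({"Lable": labels, "encode": encoded, "column": "colour"}).drop_duplicates()
print(guide)   # one row per distinct label with its numeric code ("Lable" spelling kept from the snippet)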
class MainApp(QMainWindow, ui):
    def __init__(self):
        super().__init__()  # QMainWindow is initialised via super(); no second explicit call needed
        self.setupUi(self)
        self.HandleButtons()
        self.InitUI()
        #Store dataset to this
        self.data_train = DataFrame()
        self.data_test = DataFrame()
        self.columnsRemove = []
        self.data_cleaned = DataFrame()
        self.train = True

    def InitUI(self):
        self.tabWidget.tabBar().setVisible(False)

        #Disabling remove columns before loading dataset for training
        self.listWidget_data_train.setEnabled(False)

        with open('./themes/default.css', 'r') as style_file:
            self.setStyleSheet(style_file.read())

    def HandleButtons(self):
        self.button_data_train.clicked.connect(self.HandleTrainBrowse)
        self.button_data_test.clicked.connect(self.HandleRunBrowse)
        self.button_drop.clicked.connect(self.RemoveColumn)
        self.button_drop_2.clicked.connect(self.RemoveColumn)
        self.button_train.clicked.connect(self.TrainModel)
        self.button_run.clicked.connect(self.RunModel)
        self.pushButton.clicked.connect(self.Open_Create)
        self.pushButton_2.clicked.connect(self.Open_Run)
        self.pushButton_3.clicked.connect(self.Open_Summary)
        self.pushButton_4.clicked.connect(self.open_Settings)
        self.button_model.clicked.connect(self.HandleModelBrowse)
        self.button_summary.clicked.connect(self.Summary)
        self.button_darkblue.clicked.connect(self.Apply_DarkBlue_Style)
        self.button_darkorange.clicked.connect(self.Apply_DarkOrange_Style)
        self.button_dark.clicked.connect(self.Apply_QDark_Style)
        self.button_darkgray.clicked.connect(self.Apply_DarkGray_Style)

    def GetLocation(self, operation: str, filter: str, caption: str) -> str:
        ''' Get file location either save or open file '''
        if operation == 'open':
            return QFileDialog.getOpenFileName(self,
                                               caption=caption,
                                               directory='.',
                                               filter=filter)[0].strip()
        elif operation == 'save':
            return QFileDialog.getSaveFileName(self,
                                               caption=caption,
                                               directory='.',
                                               filter=filter)[0].strip()

    def HandleTrainBrowse(self):
        ## browse the file system to pick the training CSV
        save_location: str = self.GetLocation(operation='open',
                                              caption="Open",
                                              filter="CSV Files(*.csv)")
        print(save_location)
        if (save_location != ''):
            self.lineEdit_data_train.setText(str(save_location))

            #display columns in listWidget
            self.data_train = pd.read_csv(self.lineEdit_data_train.text())
            cols = self.data_train.columns.values.tolist()
            print(cols)
            self.listWidget_data_train.addItems(cols)
            self.listWidget_data_train.setEnabled(True)
            self.button_drop.setEnabled(True)
            self.train = True

    def HandleModelBrowse(self):
        self.model_location = self.GetLocation(operation='open',
                                               caption="Open",
                                               filter="JobLib Files(*.joblib)")
        if (self.model_location != ''):
            self.lineEdit_model.setText(str(self.model_location))

    def HandleRunBrowse(self):
        ## browse the file system to pick the test CSV
        data_location = self.GetLocation(operation='open',
                                         caption="Open",
                                         filter="CSV Files(*.csv)")
        if data_location != '':
            self.lineEdit_data_test.setText(str(data_location))
            #display columns in listWidget
            self.data_test = pd.read_csv(self.lineEdit_data_test.text())
            cols = self.data_test.columns.values.tolist()
            print(cols)
            self.listWidget_data_test.addItems(cols)
            self.listWidget_data_test.setEnabled(True)
            self.button_drop_2.setEnabled(True)
            self.train = False

    def RemoveColumn(self):
        if (self.train):
            items = self.listWidget_data_train.selectedItems()
            widget = self.listWidget_data_train
            data = self.data_train
        else:
            items = self.listWidget_data_test.selectedItems()
            widget = self.listWidget_data_test
            data = self.data_test
        if not items:  # selectedItems() returns an empty list when nothing is selected
            return
        reply = QMessageBox.question(
            self, "Drop",
            "Remove '{0}'?".format(' '.join(map(lambda item: item.text(),
                                                items))),
            QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply == QMessageBox.Yes:
            for item in items:
                row = widget.row(item)
                item = widget.takeItem(row)
                self.columnsRemove.append(item.text())
                del item
            # Drop in place from data_train while training; otherwise keep a
            # cleaned copy of the test data for RunModel.
            if self.train:
                data.drop(columns=self.columnsRemove, inplace=True)
            else:
                self.data_cleaned = data.drop(columns=self.columnsRemove)

    def TrainModel(self):
        print(self.data_train.columns)
        self.listWidget_data_train.clear()
        self.columnsRemove.clear()
        save_location = self.GetLocation(operation='save',
                                         caption="Save as",
                                         filter="JobLib Files(*.joblib)")
        if save_location != '':
            print(save_location, 'model train start')
            #train model
            self.data_train.dropna(inplace=True)
            self.data_train.drop_duplicates(inplace=True)
            X = pd.get_dummies(self.data_train)
            kmeans = KMeans(init='k-means++',
                            max_iter=300,
                            n_init=10,
                            random_state=4)
            scaler = MinMaxScaler()
            scaled_features = scaler.fit_transform(X)
            visualizer = KElbowVisualizer(kmeans,
                                          k=(4, 12),
                                          metric='silhouette',
                                          timings=False)

            visualizer.fit(X)

            if (not visualizer.elbow_value_):
                clusterValue = 3
            else:
                clusterValue = visualizer.elbow_value_
            kmeans = KMeans(max_iter=300,
                            n_init=10,
                            random_state=4,
                            n_clusters=clusterValue)
            print(clusterValue)
            kmeans.fit(scaled_features)
            #save model
            dump(kmeans, save_location + '.joblib')
            print('model train done')

    def RunModel(self):
        print(self.data_cleaned.columns)
        self.listWidget_data_test.clear()
        self.model = load(self.model_location)
        self.columnsRemove.clear()
        self.data_cleaned.dropna(inplace=True)
        self.data_cleaned.drop_duplicates(inplace=True)
        X = pd.get_dummies(self.data_cleaned)
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(X)
        y_means = self.model.predict(scaled_features)
        self.data_cleaned['Cluster'] = y_means
        self.data_cleaned.to_csv('output.csv')

    def Summary(self):
        data_location = self.GetLocation('open', 'CSV Files(*.csv)', 'Open')
        if data_location != '':
            self.lineEdit_summary.setText(data_location)
            df = pd.read_csv(data_location)
            summary_df = df.describe()

            #Row count
            row = summary_df.shape[0]
            self.tableWidget.setRowCount(row)

            #Column count
            column = summary_df.shape[1]
            self.tableWidget.setColumnCount(column)

            self.tableWidget.setHorizontalHeaderLabels(
                summary_df.columns.values.tolist())
            self.tableWidget.setVerticalHeaderLabels(
                summary_df.index.values.tolist())
            print(row, column)
            for i in range(row):
                for j in range(column):
                    self.tableWidget.setItem(
                        i, j, QTableWidgetItem(str(summary_df.iloc[i, j])))
            self.tableWidget.resizeColumnsToContents()
            self.tableWidget.resizeRowsToContents()
            self.tableWidget.setEnabled(True)


################################################
###### UI Changes Methods

    def Open_Create(self):
        self.tabWidget.setCurrentIndex(0)

    def Open_Run(self):
        self.tabWidget.setCurrentIndex(3)

    def Open_Summary(self):
        self.tabWidget.setCurrentIndex(2)

    def open_Settings(self):
        self.tabWidget.setCurrentIndex(1)

    ################################################
    ###### App Themes ####

    def Apply_DarkOrange_Style(self):
        with open('./themes/darkorange.css', 'r') as style_file:
            self.setStyleSheet(style_file.read())

    def Apply_QDark_Style(self):
        with open('themes/qdark.css', 'r') as style_file:
            self.setStyleSheet(style_file.read())

    def Apply_DarkGray_Style(self):
        with open('themes/qdarkgray.css', 'r') as style_file:
            self.setStyleSheet(style_file.read())

    def Apply_DarkBlue_Style(self):
        with open('./themes/darkblue.css', 'r') as style_file:
            self.setStyleSheet(style_file.read())
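The training flow used in TrainModel above (dropna/drop_duplicates, get_dummies, MinMax scaling, elbow search, final KMeans fit, joblib dump) can be exercised outside the GUI; a minimal sketch, assuming scikit-learn, yellowbrick and joblib are installed, with toy data and a hypothetical output path:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from yellowbrick.cluster import KElbowVisualizer
from joblib import dump

df = pd.DataFrame({"income": [10, 12, 55, 60, 95, 100],
                   "segment": ["a", "a", "b", "b", "c", "c"]})
X = pd.get_dummies(df.dropna().drop_duplicates())
scaled_features = MinMaxScaler().fit_transform(X)

visualizer = KElbowVisualizer(KMeans(init='k-means++', max_iter=300, n_init=10, random_state=4),
                              k=(2, 5), metric='silhouette', timings=False)
visualizer.fit(X)
cluster_value = visualizer.elbow_value_ or 3          # fall back to 3 clusters, as the GUI code does
kmeans = KMeans(n_clusters=cluster_value, max_iter=300, n_init=10, random_state=4)
kmeans.fit(scaled_features)
dump(kmeans, 'clusters.joblib')                       # hypothetical output path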
Example #12
0
# from .items import PcautoPowerSpider Item
connection = pymongo.MongoClient('192.168.1.94', 27017)
db = connection["newcar"]
collection = db["pcauto_tmp"]
model_data = collection.find({}, {
    "carid": 1,
    "brandname": 1,
    "factoryname": 1,
    "familyname": 1,
    "brandid": 1,
    "_id": 0
})

car_msg_list = list(model_data)
car_msg_df = DataFrame(car_msg_list)
car_msg_df_new = car_msg_df.drop_duplicates('carid')


class PcautoPowerSpider(scrapy.Spider):
    name = 'pcauto_power_minBtPrice'
    allowed_domains = ['pcauto.com']
    # start_urls = ['https://price.pcauto.com.cn/price/api/v1/serialgroup/serial_group_bt_data/r3-m85355']

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(getattr(
            cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False)
            else 'custom_settings', None) or {},
                         priority='spider')

    def __init__(self, **kwargs):
Example #13
0
def _prepare_dataframes(
    forecast_df: DataFrame,
    truth_df: DataFrame,
    percentiles: Optional[List[float]] = None,
    experiment: Optional[str] = None,
) -> Tuple[DataFrame, DataFrame]:
    """Prepare dataframes for conversion to cubes by: 1) checking
    that the expected columns are present, 2) checking the percentiles
    are as expected, 3) removing duplicates from the forecast and truth,
    4) finding the sites common to both the forecast and truth dataframes
    and 5) replacing and supplementing the truth dataframe with
    information from the forecast dataframe. Note that this fifth
    step will also ensure that a row containing a NaN for the
    ob_value is inserted for any missing observations.

    Args:
        forecast_df:
            DataFrame expected to contain the following columns: forecast,
            blend_time, forecast_period, forecast_reference_time, time,
            wmo_id, percentile, diagnostic, latitude, longitude, altitude,
            period, height, cf_name, units and experiment. Any other
            columns are ignored.
        truth_df:
            DataFrame expected to contain the following columns: ob_value,
            time, wmo_id, diagnostic, latitude, longitude and altitude.
            Any other columns are ignored.
        percentiles:
            The set of percentiles to be used for estimating EMOS coefficients.
        experiment:
            A value within the experiment column to select from the forecast
            table.

    Returns:
        A sanitised version of the forecasts and truth dataframes that
        are ready for conversion to cubes.
    """
    _dataframe_column_check(forecast_df, FORECAST_DATAFRAME_COLUMNS)
    _dataframe_column_check(truth_df, TRUTH_DATAFRAME_COLUMNS)

    # Filter to select only one experiment
    if experiment:
        forecast_df = forecast_df.loc[forecast_df["experiment"] == experiment]

    if forecast_df["experiment"].nunique() > 1:
        unique_exps = forecast_df["experiment"].unique()
        msg = (
            "More than one value for the experiment column found in the "
            f"forecast dataframe. Values for experiment column {unique_exps}")
        raise ValueError(msg)

    # Extract the required percentiles.
    if percentiles:
        indices = [
            np.isclose(forecast_df["percentile"], float(p))
            for p in percentiles
        ]
        forecast_df = forecast_df[np.logical_or.reduce(indices)]

    # Check that the percentiles can be considered to be equally spaced quantiles.
    _quantile_check(forecast_df)

    # Remove forecast duplicates.
    forecast_df = forecast_df.drop_duplicates(
        subset=[
            "diagnostic", "forecast_period", "percentile", "time", "wmo_id"
        ],
        keep="last",
    )
    # Sort to ensure a consistent ordering after removing duplicates.
    forecast_df.sort_values(
        by=["blend_time", "percentile", "wmo_id"],
        inplace=True,
        ignore_index=True,
    )

    # Remove truth duplicates.
    truth_cols = ["diagnostic", "time", "wmo_id"]
    truth_df = truth_df.drop_duplicates(
        subset=truth_cols,
        keep="last",
    )
    # Sort to ensure a consistent ordering after removing duplicates.
    truth_df.sort_values(
        by=truth_cols,
        inplace=True,
        ignore_index=True,
    )

    # Find the common set of WMO IDs.
    common_wmo_ids = sorted(
        set(forecast_df["wmo_id"].unique()).intersection(
            truth_df["wmo_id"].unique()))
    forecast_df = forecast_df[forecast_df["wmo_id"].isin(common_wmo_ids)]
    truth_df = truth_df[truth_df["wmo_id"].isin(common_wmo_ids)]

    # Ensure time in forecasts is present in truths.
    forecast_df = forecast_df[forecast_df["time"].isin(
        truth_df["time"].unique())]

    # Ensure time in truths is present in forecasts.
    truth_df = truth_df[truth_df["time"].isin(forecast_df["time"].unique())]

    truth_df = truth_df.drop(columns=["altitude", "latitude", "longitude"])
    # Identify columns to copy onto the truth_df from the forecast_df
    forecast_subset = forecast_df[[
        "wmo_id",
        "latitude",
        "longitude",
        "altitude",
        "period",
        "height",
        "cf_name",
        "units",
        "time",
        "diagnostic",
    ]].drop_duplicates()

    # Use "right" to fill in any missing observations in the truth dataframe
    # and retain the order from the forecast_subset.
    truth_df = truth_df.merge(forecast_subset,
                              on=["wmo_id", "time", "diagnostic"],
                              how="right")
    return forecast_df, truth_df
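The final merge (how="right", so every forecast site is kept and missing observations appear as NaN rows) can be illustrated in isolation; a minimal sketch with toy columns rather than the full set the function requires:

import pandas as pd

truth = pd.DataFrame({"wmo_id": [1], "time": ["t0"], "diagnostic": ["temp"], "ob_value": [280.0]})
forecast_subset = pd.DataFrame({
    "wmo_id": [1, 2], "time": ["t0", "t0"], "diagnostic": ["temp", "temp"],
    "latitude": [50.0, 51.0], "longitude": [0.0, 1.0], "altitude": [10.0, 20.0],
}).drop_duplicates()

merged = truth.merge(forecast_subset, on=["wmo_id", "time", "diagnostic"], how="right")
print(merged)   # site 2 has no observation, so its ob_value is NaN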
Example #14
0
# from .items import PcautoPowerSpider Item
connection = pymongo.MongoClient('192.168.1.94', 27017)
db = connection["newcar"]
collection = db["pcauto_tmp"]
model_data = collection.find({}, {
    "familyid": 1,
    "brandname": 1,
    "factoryname": 1,
    "familyname": 1,
    "brandid": 1,
    "_id": 0
})

car_msg_list = list(model_data)
car_msg_df = DataFrame(car_msg_list)
car_msg_df_new = car_msg_df.drop_duplicates('familyid')


class PcautoPowerSpider(scrapy.Spider):
    name = 'pcauto_power'
    allowed_domains = ['pcauto.com']
    # start_urls = ['http://pcauto.com/']

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(getattr(
            cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False)
            else 'custom_settings', None) or {},
                         priority='spider')

    def __init__(self, **kwargs):
Example #15
0
db = connection["dasouche"]
collection_city = db["dasouche_city"]
collection_modellist = db["dasouche_modellist"]

model_city = collection_city.find({}, {"cityName": 1, "cityId": 1, "_id": 0})
car_city_list = list(model_city)
city_dic = {data["cityName"]: data["cityId"] for data in car_city_list}
print(city_dic)
model_data = collection_modellist.find({}, {
    "brandName": 1,
    "modelCode": 1,
    "year": 1,
    "_id": 0
})
car_msg_df = DataFrame(list(model_data)).drop_duplicates('modelCode')
num = car_msg_df.drop_duplicates('modelCode')['modelCode'].count()
print(num)

for index, car in car_msg_df.iterrows():
    url_list = list()
    month_now = datetime.now().month
    year_now = datetime.now().year
    for year in range(car["year"] - 1, year_now + 1):
        month = month_now - 1 if year == year_now else month_now
        for city_n, city_i in city_dic.items():
            registerDate = str(year) + "-" + str(month)
            mile = 0.1 if year == year_now else (2 * (year_now - year))
            meta = {
                "model": car["modelCode"],
                "registerDate": registerDate,
                "city_n": city_n,
    "regDate": 1,
    "mile": 1,
    "_id": 0
})

# car_msg_list1 = list(model_data1)[5:500]
# car_msg_df1 = DataFrame(car_msg_list1)
# car_msg_df_new1 = car_msg_df1.drop_duplicates('salesdescid').dropna(axis=0, how='any')
# sid_list = [str(sid).replace('.0', '') for sid in car_msg_df_new1["salesdescid"].values]
# print(sid_list)

car_msg_list3 = list(model_data3)
car_msg_df3 = DataFrame(car_msg_list3)
sid_list = [
    str(sid).replace('.0', '')
    for sid in car_msg_df3.drop_duplicates('salesdescid')["salesdescid"].values
]
print(len(sid_list))

car_msg_list2 = list(model_data2)
car_msg_df2 = DataFrame(car_msg_list2)
car_msg_df_new2 = car_msg_df2[car_msg_df2['salesdescid'].isin(sid_list)]
sid_list2 = [
    str(sid).replace('.0', '') for sid in car_msg_df_new2.drop_duplicates(
        'salesdescid')["salesdescid"].values
]
print(car_msg_df_new2["salesdescid"].count())
print(len(sid_list2))

msg_df_new2 = car_msg_df2[car_msg_df2['salesdescid'].isin(sid_list2)]
msg_df_new3 = car_msg_df3[car_msg_df3['salesdescid'].isin(sid_list2)]
Example #17
0
db = connection["chexiu"]
collection = db["chexiu_car"]
model_data = collection.find({}, {
    "vehicle_id": 1,
    "vehicle": 1,
    "brandname": 1,
    "brand_id": 1,
    "familyname": 1,
    "family_id": 1,
    "factoryname": 1,
    '_id': 0
})

car_msg_list = list(model_data)
car_msg_df = DataFrame(car_msg_list)
car_msg_df_new = car_msg_df.drop_duplicates('vehicle_id')


class ChexiuspiderSpider(scrapy.Spider):
    name = 'chexiuSpider'
    allowed_domains = ['chexiu.com']

    start_urls = [
        'https://sz.chexiu.com/index.php?r=site/api/depList&isshowall=1'
    ]

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(getattr(
            cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False)
            else 'custom_settings', None) or {},
# work out which chunk of data to run
data_f = '2020-05-02'
data_0 = '2020-06-23'
date1 = time.strptime(data_0, '%Y-%m-%d')
now_d = time.strftime('%Y-%m-%d', time.localtime())
date2 = time.strptime(now_d, '%Y-%m-%d')
start_d = datetime(date1[0], date1[1], date1[2])
end_d = datetime(date2[0], date2[1], date2[2])
part_num = (end_d - start_d).days % 28
# part_num = 28 if part_num == 0 else part_num

print(part_num+1)

# # pick the data ids for the current day
# car_msg_df_new = car_msg_df[car_msg_df["part"] == part_num+1]
car_msg_df_new = car_msg_df.drop_duplicates('salesdescid')
print(car_msg_df_new["salesdescid"].count())
#
#

# change the collection name
connection2 = pymongo.MongoClient('192.168.2.149', 27017)
local_time = time.strftime('%Y-%m-%d', time.localtime())
print(local_time)
db2 = connection2['che300']
collection2 = db2['che300_21_price']

# count = collection2.count()
# if count:
#     print(count)
#     name = 'che300_21_price_' + str(part_num) + '_' + str(getYesterday())
Example #19
0
                collection.rename(name)
        collection3.remove()
        data_list = []
        # data = {"start_num": 0, "end_num": 100}
        data = {"start_num": end_num, "end_num": end_num + 100}
        data_list.append(data)
        collection3.insert(data_list)

        collection1 = db3["che300_split"]

        model_data1 = collection1.find({}, {"brandname": 1, "brandid": 1, "familyid": 1, "salesdescid": 1, "min_reg_year": 1,
                                          "max_reg_year": 1, "part": 1, "_id": 0})

        car_msg_list1 = list(model_data1)
        car_msg_df = DataFrame(car_msg_list1)
        car_msg_df_new = car_msg_df.drop_duplicates('salesdescid').dropna(axis=0, how='any')
        print(car_msg_df_new["salesdescid"].count())

        db2 = connection2["che300"]
        collection2 = db2["che300_queue"]
        model_data2 = collection2.find({}, {"salesdescid": 1,  "_id": 0})

        # start_num = 0
        # end_num = 500
        car_msg_list2 = list(model_data2)
        sid_list = list()
        for i in car_msg_list2[start_num:end_num]:
            sid_list.append(str(i["salesdescid"]).replace('.0', ''))
        # print(sid_list)

        partnerId = ['douyin', 'escsh', 'yanchebang', 'jhhz', 'ynhcj', 'chexiaopang']
Example #20
0
    uri1 = f'mongodb://192.168.2.149:{settings["MONGODB_PORT"]}/'
    # connection = pymongo.MongoClient(uri1, unicode_decode_error_handler='ignore')
    connection = pymongo.MongoClient(uri1)

    db = connection['che300']
    collection = db['che300_price_daily']
    # collection = collection.with_options(codec_options=bson.CodecOptions(unicode_decode_error_handler="ignore"))

    uri2 = f'mongodb://192.168.1.92:{settings["MONGODB_PORT"]}/'
    connection2 = pymongo.MongoClient(uri2)
    db2 = connection2[settings['MONGODB_DB']]
    collection2 = db2['che300_41city_url']
    model_data = collection.find({}, {"url": 1, "_id": 0})

    car_msg_df = DataFrame(list(model_data))
    car_msg_df = car_msg_df.drop_duplicates('url')
    have_num = car_msg_df["url"].count()
    print(f"现有数据量:{have_num}")

    model_data2 = collection2.find({}, {"url": 1, "_id": 0})
    car_msg_df2 = DataFrame(list(model_data2)).drop_duplicates('url')
    all_num = car_msg_df2["url"].count()
    print(f"总共数据量:{all_num}")

    df_a_filter = car_msg_df2[~car_msg_df2['url'].isin(car_msg_df['url'])]
    miss_num = df_a_filter["url"].count()
    print(f"缺少数据量:{miss_num}")

    miss_sid_list = [
        1373449, 1373450, 1373451, 1373455, 1400877, 1400879, 1400880, 1400976,
        1401128, 1401129, 1401130, 1401131, 1401134, 1401137, 1401138, 1401141,