Example #1
def get_train_and_val(df, val_prop: float):
    """
    Splits into training and validation set, where validation set has 50% negative edges

    Args:
        df:
        val_prop:

    Returns:

    """
    n_val_samples = int(val_prop * df.shape[0])
    logger.info('Eventual required val samples (proportion: {}): {:,}'.format(
        val_prop, n_val_samples))

    train, val = train_val_split(df, n_val_samples)
    logger.info('Ratio of train to val: {:,}:{:,} ({:.2f})'.format(
        train.shape[0], val.shape[0],
        val.shape[0] / (train.shape[0] + val.shape[0])))

    neg_samples = create_negative_edges(df, val, n_val_samples)

    val = combine_val_and_neg_edges(val, neg_samples)
    train = train[['product1', 'product2', 'weight']].copy()

    return train, val
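train_val_split is not shown in this example. A minimal sketch of what such a helper might look like, assuming the last n_val_samples rows are held out (the real implementation may sample randomly):

import pandas as pd

def train_val_split(df: pd.DataFrame, n_val_samples: int):
    # Hypothetical helper: hold out the last n_val_samples rows for
    # validation and keep the rest for training.
    val = df.tail(n_val_samples).copy()
    train = df.iloc[:-n_val_samples].copy()
    return train, val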
Example #2
def remove_account():
    config_obj = read_config_file()

    if len(config_obj) == 0:
        logger.info("No config found.")
    else:
        available_servers = []
        for i, entry in enumerate(config_obj):
            if 'Host' in entry:
                available_servers.append({
                    'key': i,
                    'name': entry['Host'],
                    'value': i,
                })
        host_ques = [{
            'type': 'list',
            'name': 'ssh-hosts',
            'message': 'Select host to remove:',
            'choices': available_servers + retrieve_questions('default_exits'),
        }]
        selected_host = prompt(host_ques, style=style)
        default_menu_or_exit(selected_host['ssh-hosts'])

        confirm_removal = prompt(retrieve_questions('confirm_remove'),
                                 style=style)
        if confirm_removal['remove_confirmation']:
            config_obj.pop(selected_host['ssh-hosts'])
            write_config_to_file(config_obj)
            logger.info("Finished removing config.")
        else:
            init()
Example #3
 def run(self):
     try:
         logger.info("start wuhan vol crawler.")
         self.parse()
         logger.info("end wuhan vol crawler.")
     except Exception as e:
         logger.error("error: %s", e)
Example #4
    def warm_up(self, scaler, model, dataloader, cfg, prefix='train'):
        optimizer = build_optimizer(cfg, model)
        model.train()

        cur_iter = 0
        while cur_iter < cfg.WARMUP.ITERS:
            for i, sample in enumerate(dataloader):
                cur_iter += 1
                if cur_iter >= cfg.WARMUP.ITERS:
                    break
                lr = get_warmup_lr(cur_iter, cfg)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                losses = self.run_step(scaler, model, sample, optimizer, None, None, prefix)

                if self.cfg.local_rank == 0:
                    template = "[iter {}/{}, lr {}] Total train loss: {:.4f} \n" "{}"
                    logger.info(
                        template.format(
                            cur_iter, cfg.WARMUP.ITERS, round(get_current_lr(optimizer), 6),
                            losses["loss"].item(),
                            "\n".join(
                                ["{}: {:.4f}".format(n, l.item()) for n, l in losses.items() if n != "loss"]),
                        )
                    )
        del optimizer
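get_warmup_lr is not shown; a common scheme is linear warmup over cfg.WARMUP.ITERS iterations. A hedged sketch under that assumption (cfg.WARMUP.BASE_LR is a hypothetical field; the real config may name the base learning rate differently):

def get_warmup_lr(cur_iter: int, cfg) -> float:
    # Linear warmup: ramp the learning rate from 0 up to an assumed base
    # value over the configured number of warmup iterations.
    return cfg.WARMUP.BASE_LR * cur_iter / cfg.WARMUP.ITERS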
Example #5
def process_sample_set(obj_ver_key: str, obj_data: dict) -> None:
    """
    obj_ver_key: object version key
    obj_data: object data
    """
    # term_bank dictionary for storing arango document information about
    # already encountered terms. mapping of ontology_term -> arango "_id" field
    term_bank: Dict[str, str] = {}
    edges: List[dict] = []
    # iterate per sample
    for sample_info in obj_data['data']['samples']:
        # retrieve the sample metadata
        sample = _get_sample(sample_info)
        sample_version_uuid = _get_sample_version_uuid(sample)
        # term_bank object and edges list passed by reference
        # find terms we know are ontology terms
        _generate_link_information(sample, sample_version_uuid, edges,
                                   term_bank)
    # add creation timestamp for edge link, (same for all edges).
    created_timestamp = _now_epoch_ms() + 20 * len(
        edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_timestamp
    logger.info(f'Writing {len(edges)} sample -> ontology edges '
                f'for samples in SampleSet {obj_ver_key}')
    # save link in bulk operation
    _save(SAMPLE_ONTOLOGY_COLL, edges)
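_now_epoch_ms is presumably a thin wrapper around the system clock; a minimal sketch:

import time

def _now_epoch_ms() -> int:
    # Current Unix time in milliseconds.
    return int(time.time() * 1000)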
Example #6
def get_edges(df):
    """
    Returns a dataframe of products and the weights of the edges between them.

    Args:
        df:

    Returns:

    """
    logger.info('Relationship distribution: \n{}'.format(df['relationship'].value_counts()))

    df = create_product_pair(df, col_list=['asin', 'related'])
    logger.info('Product pairs created')

    df = get_relationship_weights(df, relationship_weights)
    logger.info('Relationship weights updated')

    # Aggregate to remove duplicates
    logger.info('Original no. of edges: {:,}'.format(df.shape[0]))
    df = df.groupby('product_pair').agg({'weight': 'sum'}).reset_index()
    logger.info('Deduplicated no. of edges: {:,}'.format(df.shape[0]))

    # Save edge list
    df['product1'], df['product2'] = zip(*df['product_pair'].apply(split_product_pair))

    df = df[['product1', 'product2', 'weight', 'product_pair']]
    return df
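create_product_pair and split_product_pair are assumed helpers that join and split the two product IDs around a delimiter. A minimal sketch of that convention (the actual delimiter may differ):

import pandas as pd

def create_product_pair(df: pd.DataFrame, col_list) -> pd.DataFrame:
    # Hypothetical helper: concatenate the two ID columns into a single
    # 'product_pair' key, e.g. 'B000123|B000456'.
    df['product_pair'] = df[col_list[0]] + '|' + df[col_list[1]]
    return df

def split_product_pair(pair: str):
    # Inverse of create_product_pair: recover the two product IDs.
    return tuple(pair.split('|'))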
Example #7
def remove_account():
    config_obj = read_config_file()

    if len(config_obj) == 0:
        logger.info("No config found.")
    else:
        host_ques = [{
            'type': 'list',
            'name': 'git-hosts',
            'message': 'Select host to remove:',
            'choices': list(config_obj.keys()) + retrieve_questions('default_exits'),
        }]
        selected_host = prompt(host_ques, style=style)
        default_menu_or_exit(selected_host['git-hosts'])

        confirm_removal = prompt(retrieve_questions('confirm_remove'),
                                 style=style)
        if confirm_removal['remove_confirmation']:
            del config_obj[selected_host['git-hosts']]
            write_config_to_file(config_obj)
            logger.info("Account removal successful.")
        else:
            init()
Example #8
def delete_obj(msg):
    """
    Checks that the received object is deleted, since the workspace object
    delete event can refer to either delete or undelete state changes.
    """
    wsid = msg['wsid']
    objid = msg['objid']
    if not check_object_deleted(wsid, objid):
        # Object is not deleted
        logger.info(f'Object {objid} in workspace {wsid} is not deleted')
        return
    # Perform the deletion
    query = {
        'bool': {
            'must': [
                {'term': {'access_group': wsid}},
                {'term': {'obj_id': objid}}
            ]
        }
    }
    json_body = json.dumps({'query': query})
    # Perform the delete_by_query using the elasticsearch http api.
    resp = requests.post(
        f"{_ES_URL}/{_IDX}/_delete_by_query",
        params={'conflicts': 'proceed'},
        data=json_body,
        headers={"Content-Type": "application/json"}
    )
    if not resp.ok:
        # Unsuccessful request to elasticsearch.
        raise RuntimeError(f"Error deleting object on elasticsearch:\n{resp.text}")
    logger.info(f"Deleted elasticsearch documents associated with obj {wsid}/{objid}")
Example #9
def _generate_features(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        logger.info(f'Genome {obj_ver_key} has no features')
        return
    verts = []
    edges = []
    wsid = obj_data['info'][6]
    objid = obj_data['info'][0]
    ver = obj_data['info'][4]
    # might want to do this in smaller batches if memory pressure is an issue
    for f in d['features']:
        feature_key = _clean_key(f'{obj_ver_key}_{f["id"]}')
        verts.append({
            '_key': feature_key,
            'workspace_id': wsid,
            'object_id': objid,
            'version': ver,
            'feature_id': f['id']
        })
        edges.append({
            '_key': f'{feature_key}',  # make a unique key so overwrites work
            '_from': f'{_OBJ_VER_COLL}/{obj_ver_key}',
            '_to': f'{_WS_FEAT_COLL}/{feature_key}'
        })
    logger.info(f'Saving {len(verts)} features for genome {obj_ver_key}')
    # hmm, this could leave the db in a corrupt state... options are 1) rollback 2) retry 3) leave
    # rollback is kind of impossible as an error here implies the re api isn't reachable
    # retry is doable, but should probably be implemented much higher in the stack
    # So 3 for now
    # reindexing will overwrite and fix
    _save(_WS_FEAT_COLL, verts)
    _save(_WS_FEAT_EDGE_COLL, edges)
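_clean_key is assumed to sanitize feature IDs into valid ArangoDB _key values. A hedged sketch that keeps a conservative character set and replaces everything else:

import re

def _clean_key(key: str) -> str:
    # Hypothetical sanitizer: ArangoDB document keys allow a restricted
    # character set, so replace anything outside it with an underscore.
    return re.sub(r"[^a-zA-Z0-9_\-:.@()+,=;$!*'%]", '_', key)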
Example #10
    def __init__(self, edge_path: str, val_path: str, power: float = 0.75):
        """
        Initializes an Edges object for use in a Dataset.

        Args:
            edge_path: Path to CSV of edges (product pairs and weights)
            val_path: Path to CSV of the validation set
            power: Negative sampling parameter; suggested 0.75
        """
        self.power = power
        self.negative_idx = 0
        self.n_unique_tokens = 0

        self.edges = pd.read_csv(edge_path)
        self.n_edges = len(self.edges)
        logger.info('Edges loaded (length = {:,})'.format(self.n_edges))

        self.val = pd.read_csv(val_path)
        logger.info('Validation set loaded: {}'.format(self.val.shape))

        self.product_set = self.get_product_set()
        self.word2id, self.id2word = self.get_mapping_dicts()
        self.get_product_id_func = np.vectorize(self.get_product_id)
        self.n_unique_tokens = len(self.word2id)
        logger.info('No. of unique tokens: {}'.format(self.n_unique_tokens))
        save_model(self.word2id, '{}/word2id_edge'.format(MODEL_PATH))
        save_model(self.id2word, '{}/id2word_edge'.format(MODEL_PATH))
        logger.info('Word2Id and Id2Word created and saved')

        # Convert product ID strings to integers
        self.edges = self.prep_edges()
        logger.info('Edges prepared')

        # Prepare negative sampling table
        self.word_freq = self.get_word_freq(self.edges[:, :2])
        self.neg_table = self.get_negative_sample_table(self.power)
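get_negative_sample_table appears to follow the standard word2vec trick: token frequencies are raised to power (0.75 by default) and expanded into a large lookup table so negatives can be drawn by uniform indexing. A standalone sketch under that assumption (table size and layout are guesses):

import numpy as np

def get_negative_sample_table(word_freq: dict, power: float = 0.75,
                              table_size: int = 10_000_000) -> np.ndarray:
    # word_freq maps token id -> count. Raise counts to `power`, normalize,
    # and repeat each id proportionally so that uniform draws from the table
    # approximate the smoothed unigram distribution.
    ids = np.array(list(word_freq.keys()))
    freq = np.array(list(word_freq.values()), dtype=np.float64) ** power
    probs = freq / freq.sum()
    counts = np.round(probs * table_size).astype(int)
    return np.repeat(ids, counts)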
Example #11
def train_embeddings(sequences,
                     workers,
                     dimension=128,
                     window=5,
                     min_count=1,
                     negative=5,
                     epochs=3,
                     seed=42):
    # Logging specific to gensim training
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Initialize model
    model = Word2Vec(sequences,
                     workers=workers,
                     size=dimension,
                     window=window,
                     min_count=min_count,
                     negative=negative,
                     seed=seed)
    logger.info('Model initialized')

    # Train model (No need to retrain model as initialization includes training)
    # model.train(sequences, total_examples=len(sequences), epochs=epochs)
    # logger.info('Model trained!')

    return model
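A hedged usage sketch (the size keyword implies gensim 3.x; sequences is an iterable of token lists, e.g. the random-walk samples converted to strings):

# Hypothetical usage with a toy corpus.
sequences = [['p1', 'p2', 'p3'], ['p2', 'p4', 'p1']]
model = train_embeddings(sequences, workers=4, dimension=64)
vector = model.wv['p1']                          # embedding for product 'p1'
similar = model.wv.most_similar('p1', topn=2)    # nearest neighbours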
Example #12
def _generate_GO_links(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        # no features logged already in _generate_features
        return
    f_to_go = {}
    for f in d['features']:
        # this works for Genome-8.2 to 10.0 in production
        if _ONTOLOGY_TERMS in f and _ONTOLOGY_GO_KEY in f[_ONTOLOGY_TERMS]:
            f_to_go[f['id']] = f[_ONTOLOGY_TERMS][_ONTOLOGY_GO_KEY].keys()
    terms_set = {i for items in f_to_go.values() for i in items}  # flatten
    query_time = _now_epoch_ms()
    # might want to do this in smaller batches if memory pressure is an issue
    resolved_terms = _resolve_GO_terms(terms_set, query_time)
    edges = []
    for f in f_to_go:
        for g in f_to_go[f]:
            if g not in resolved_terms:
                logger.info(f"Couldn't resolve GO term {g} in Genome {obj_ver_key} feature {f}")
            else:
                featurekey = _clean_key(f'{obj_ver_key}_{f}')
                edges.append({
                    '_key': f'{featurekey}::{resolved_terms[g]}::kbase_RE_indexer',
                    '_from': f'{_WS_FEAT_COLL}/{featurekey}',
                    '_to': f'{_GO_TERM_COLL}/{resolved_terms[g]}',
                    'source': 'kbase_RE_indexer',
                    'expired': _MAX_ADB_INTEGER
                })
    created_time = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_time
    logger.info(f'Writing {len(edges)} feature -> GO edges for genome {obj_ver_key}')
    _save(_WS_FEAT_TO_GO_COLL, edges, on_duplicate='ignore')
Example #13
    def __init__(self, emb_sizes, emb_dim):
        super().__init__()
        self.emb_sizes = emb_sizes
        self.emb_dim = emb_dim

        # Create embedding layers
        self.center_embeddings = nn.ModuleList()
        for k, v in self.emb_sizes.items():
            self.center_embeddings.append(nn.Embedding(v, emb_dim,
                                                       sparse=True))

        self.context_embeddings = nn.ModuleList()
        for k, v in self.emb_sizes.items():
            self.context_embeddings.append(
                nn.Embedding(v, emb_dim, sparse=True))

        # Create embedding weighting layer
        self.emb_weights = nn.Embedding(
            emb_sizes['product'], len(emb_sizes),
            sparse=True)  # emb_sizes['product'] is total number of products
        self.emb_weights_softmax = nn.Softmax(dim=1)

        self.init_emb()

        logger.info('Model initialized: {}'.format(self))
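init_emb is not shown; a common word2vec-style choice is small uniform weights for the center embeddings and zeros for the context embeddings. A sketch of such a method under that assumption:

import torch.nn as nn

def init_emb(self):
    # Assumed initialization: uniform center embeddings in a small range,
    # zeroed context embeddings and embedding weights.
    init_range = 0.5 / self.emb_dim
    for emb in self.center_embeddings:
        nn.init.uniform_(emb.weight.data, -init_range, init_range)
    for emb in self.context_embeddings:
        nn.init.constant_(emb.weight.data, 0)
    nn.init.constant_(self.emb_weights.weight.data, 0)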
Example #14
def get_raw_data(path_raw_data,
                 group_columns,
                 date_column='WorkingDate',
                 target_column=config.TARGET):
    logger.info(f"Downloading raw data from {path_raw_data}")

    all_columns = group_columns.copy()
    all_columns.append(target_column)
    all_columns.append(date_column)

    raw_data = pd.read_csv(Path(path_raw_data), sep=";")
    raw_data[date_column] = pd.to_datetime(raw_data[date_column])
    raw_data = raw_data[all_columns]

    date_range = pd.date_range(raw_data[date_column].min(),
                               raw_data[date_column].max(),
                               freq='D')
    filled_raw_data = (raw_data
                       .set_index(date_column)
                       .groupby(group_columns)
                       .apply(lambda d: d.reindex(date_range))
                       .drop(group_columns, axis=1)
                       .reset_index(group_columns)
                       .fillna(0))

    filled_raw_data = filled_raw_data.reset_index()
    filled_raw_data = filled_raw_data.rename(columns={'index': date_column})

    return filled_raw_data
Example #15
def create_random_walk_samples(node_dict,
                               transition_dict,
                               samples_per_node=10,
                               sequence_len=10):
    random.seed(42)
    n_nodes = len(node_dict)

    sample_array = np.zeros((n_nodes * samples_per_node, sequence_len),
                            dtype=int)
    logger.info('Sample array shape: {}'.format(sample_array.shape))

    # For each node
    for node_idx in range(n_nodes):

        if node_idx % 100000 == 0:
            logger.info('Getting samples for node: {:,}/{:,}'.format(
                node_idx, n_nodes))

        # For each sample
        for sample_idx in range(samples_per_node):
            node = node_idx

            # For each event in sequence
            for seq_idx in range(sequence_len):
                sample_array[node_idx * samples_per_node + sample_idx,
                             seq_idx] = node
                node = random.choices(
                    population=transition_dict[node]['product'],
                    weights=transition_dict[node]['probability'],
                    k=1)[0]

    return sample_array
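transition_dict is assumed to map each integer node to its neighbours ('product') and normalized edge weights ('probability'). A sketch of building it from a weighted networkx graph and the node_dict produced by load_network below:

import networkx

def create_transition_dict(graph: networkx.Graph, node_dict: dict) -> dict:
    # Hypothetical helper: for every node, list its neighbours and turn the
    # edge weights into transition probabilities.
    name_to_idx = {name: idx for idx, name in node_dict.items()}
    transition_dict = {}
    for idx, name in node_dict.items():
        neighbours = list(graph[name])
        weights = [graph[name][n]['weight'] for n in neighbours]
        total = sum(weights)
        transition_dict[idx] = {
            'product': [name_to_idx[n] for n in neighbours],
            'probability': [w / total for w in weights],
        }
    return transition_dict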
Example #16
def run_importer(obj, ws_info, msg):
    start = time.time()
    type_, _ = obj['info'][2].split('-')  # 2nd var is version
    if type_ in config()['global']['ws_type_blacklist']:
        logger.info(f'Skipped RE import of blacklisted type {type_}')
    else:
        import_object(obj, ws_info)
        logger.info(f"Imported an object into RE in {time.time() - start}s.")
Example #17
def get_forecast_calendar(path_forecast_calendar, date_column='WorkingDate'):
    logger.info(f"Downloading forecast calendar from {path_forecast_calendar}")

    forecast_calendar = pd.read_excel(Path(path_forecast_calendar))
    forecast_calendar[date_column] = pd.to_datetime(
        forecast_calendar[date_column])

    return forecast_calendar
Example #18
def _fetch_global_config(config_url):
    """
    Fetch the index_runner_spec configuration file from a URL to a yaml file.
    """
    logger.info(f'Fetching config from url: {config_url}')
    # Fetch the config directly from config_url
    with urllib.request.urlopen(config_url) as res:  # nosec
        return yaml.safe_load(res.read())
Example #19
def get_categories(df: pd.DataFrame) -> pd.DataFrame:
    df['category_lvl_1'] = df['categories'].apply(get_category_lvl, args=(0, ))
    df['category_lvl_2'] = df['categories'].apply(get_category_lvl, args=(1, ))
    df['category_lvl_3'] = df['categories'].apply(get_category_lvl, args=(2, ))
    df['category_lvl_4'] = df['categories'].apply(get_category_lvl, args=(3, ))
    logger.info('Categories lvl 1 - 4 prepared')

    return df
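get_category_lvl is an assumed helper that picks one level out of a nested category list. A minimal sketch, assuming the categories column holds a list of category paths as in the Amazon metadata dumps:

def get_category_lvl(categories, lvl: int):
    # Hypothetical helper: return the requested level of the first category
    # path, e.g. [['Books', 'Science', 'Physics']] with lvl=1 -> 'Science',
    # or None when the path is missing or too shallow.
    try:
        return categories[0][lvl]
    except (IndexError, TypeError):
        return None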
Example #20
 def run(self):
     try:
         logger.info("start bj gov crawler.")
         self.get_wangqian()
         self.get_history()
         logger.info("end bj gov crawler.")
     except Exception as e:
         logger.error("error: %s", e)
Example #21
def cross_validate_model(training_df: pd.DataFrame,
                         rf: RandomForestRegressor) -> None:
    # cross validation
    scores = cross_val_score(rf,
                             training_df.drop(['revenue'], axis=1),
                             training_df['revenue'],
                             cv=5,
                             scoring='neg_mean_squared_log_error')
    logger.info(scores)
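A hedged usage sketch with a hypothetical training frame containing a revenue column:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Hypothetical data: two features plus the 'revenue' target.
training_df = pd.DataFrame({
    'budget': np.random.rand(100),
    'runtime': np.random.rand(100),
    'revenue': np.random.rand(100),
})
rf = RandomForestRegressor(n_estimators=50, random_state=42)
cross_validate_model(training_df, rf)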
Example #22
    def start_consumers(self):
        logger.info('Starting Consumers...')

        try:
            self._channel.start_consuming()
        except KeyboardInterrupt:
            self._channel.stop_consuming()

        rbmq.conn.close()
Example #23
 def run(self):
     try:
         logger.info("start lianjia crawler")
         self.craw_stat()
         self.craw_open()
         self.crawPriceTrends()
         logger.info("end lianjia crawler")
     except Exception as e:
         logger.error("error: %s", e)
Example #24
    def __init__(self, depth_limit: int = 99):
        """Initializes a decision tree with depth limit

        Args:
            depth_limit: Maximum depth to build the tree
        """
        self.root = None
        self.depth_limit = depth_limit
        logger.info('{} initialized with depth limit: {}'.format(self.__class__.__name__, depth_limit))
Example #25
def load_network(edgelist_path):
    graph = networkx.read_weighted_edgelist(edgelist_path)
    logger.info('No of nodes ({:,}) and edges ({:,})'.format(
        graph.number_of_nodes(), graph.number_of_edges()))

    # Get dictionary mapping of integer to nodes
    node_dict = {i: key for i, key in enumerate(graph.nodes.keys())}

    return graph, node_dict
Example #26
    def wrapper(*args, **kwargs):
        if 'Api-Key' not in request.headers:
            return {'status':'error', 'msg': 'Api-Key not present on Request Headers'}, http.HTTPStatus.BAD_REQUEST

        if request.headers['Api-Key'] != apikey.API_KEY:
            logger.info('Error: ApiKey not valid')
            return {'status':'error', 'msg': 'ApiKey not valid'}, http.HTTPStatus.UNAUTHORIZED

        return fn(*args, **kwargs)
Example #27
def close_consumer(consumer: Consumer) -> None:
    """
    This will close the network connections and sockets. It will also trigger
    a rebalance immediately rather than wait for the group coordinator to
    discover that the consumer stopped sending heartbeats and is likely dead,
    which will take longer and therefore result in a longer period of time in
    which consumers can’t consume messages from a subset of the partitions.
    """
    consumer.close()
    logger.info("Closed the Kafka consumer")
Example #28
 def run(self):
     try:
         logger.info("start hangzhou vol crawler.")
         prev_url = self.get_month_vol(datetime.date.today() - datetime.timedelta(days=1), None)
         while prev_url:
             prev_url = self.get_month_vol(None, prev_url)
             time.sleep(6)
         logger.info("end hangzhou vol crawler.")
     except:
         logger.error("error")
Example #29
def add_account(email, username, hostname):
    config_obj = read_config_file()

    config_obj[hostname] = {
        "hostName": hostname,
        "name": username,
        "email": email
    }
    write_config_to_file(config_obj)
    logger.info("Finished adding account.")
Example #30
 def run(self):
     try:
         logger.info("start draw img.")
         self.draw_vol_1days()
         self.draw_vol_7days()
         self.draw_vol_monthly()
         self.draw_price_trends()
         logger.info("end draw img.")
     except:
         logger.error("error.")
Example #31
def start_service(wait_for_url, wait_for_name):
    global container_process
    global container_out
    global container_err

    cmd = "docker-compose --no-ansi up"
    logger.info(f'Running command:\n{cmd}')
    container_out = open("container.out", "w")
    container_err = open("container.err", "w")
    container_process = subprocess.Popen(cmd, shell=True, stdout=container_out, stderr=container_err)
    wait_for_service(wait_for_url, wait_for_name)
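wait_for_service is assumed to poll the URL until the service answers. A minimal sketch using requests:

import time
import requests

def wait_for_service(url: str, name: str, timeout: int = 180) -> None:
    # Hypothetical helper: poll until the service responds or the timeout
    # is exceeded.
    start = time.time()
    while True:
        try:
            requests.get(url, timeout=5)
            logger.info(f'{name} is up at {url}')
            return
        except requests.exceptions.RequestException:
            if time.time() - start > timeout:
                raise RuntimeError(f'{name} did not start within {timeout}s')
            time.sleep(3)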
Example #32
def collect_samples(item_array, sample_size, n_samples):
    samples = []

    for i in range(0, n_samples):
        if i % 1000000 == 0:
            logger.info('Neg sample: {:,}'.format(i))

        sample = get_sample(item_array, n_iter=i, sample_size=sample_size)
        samples.append(sample)

    return samples
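get_sample is not shown; the n_iter argument suggests deterministic, offset-based reads from a pre-shuffled array rather than independent random draws. One plausible sketch under that assumption:

import numpy as np

def get_sample(item_array: np.ndarray, n_iter: int, sample_size: int) -> np.ndarray:
    # Hypothetical implementation: treat item_array as pre-shuffled and read
    # consecutive slices, wrapping around at the end of the array.
    start = (n_iter * sample_size) % len(item_array)
    sample = item_array[start:start + sample_size]
    if len(sample) < sample_size:
        sample = np.concatenate([sample, item_array[:sample_size - len(sample)]])
    return sample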
Example #33
def _pull_docker_image(image):
    """check if image exists, if not pull it."""
    li = _DOCKER.images.list()
    pulled = False
    for im in li:
        if image in im.tags:
            # id_ = im.id
            pulled = True
    if not pulled:
        logger.info("Pulling %s" % image)
        _DOCKER.images.pull(image)
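A hedged usage sketch with the docker SDK for Python (image tag is hypothetical):

import docker

_DOCKER = docker.from_env()          # module-level client assumed by the example
_pull_docker_image('redis:6')        # hypothetical image tag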
Example #34
    def insert(self, data):
        logger.info(f'Insert data on {self.config.CALC_QUEUE}')

        conn = self._set_connection()
        channel = conn.channel()
        channel.queue_declare(queue=self.config.CALC_QUEUE)
        channel.basic_publish(exchange='',
                              routing_key=self.config.CALC_QUEUE,
                              body=json.dumps(data.serialize()))
        conn.close()

        logger.info('Data inserted')
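The channel/queue_declare/basic_publish calls match pika's BlockingConnection API; a sketch of what _set_connection might look like under that assumption (self.config.RABBIT_HOST is a hypothetical field):

import pika

def _set_connection(self):
    # Hypothetical helper: open a blocking connection to the broker named
    # in the config.
    return pika.BlockingConnection(
        pika.ConnectionParameters(host=self.config.RABBIT_HOST))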
Example #35
    def val_epoch(self, epoch, model, dataloader, optimizer=None, lr_scheduler=None, prefix="val"):
        model.eval()

        lossMeter = LossMeter()
        perfMeter = PerfMeter()

        with torch.no_grad():
            for (imgs, labels) in dataloader:

                if self.cfg.HALF:
                    imgs = imgs.half()

                if len(self.device)>1:
                    losses, performances = data_parallel(model, (imgs, labels, prefix), device_ids=self.device,
                                                         output_device=self.device[-1])
                else:
                    imgs = imgs.cuda()
                    labels = [label.cuda() for label in labels] if isinstance(labels,list) else labels.cuda()
                    losses, performances = model(imgs, labels, prefix)

                lossMeter.__add__(losses)
                perfMeter.__add__(performances)

                del imgs, labels, losses, performances

        avg_losses = lossMeter.average()
        avg_perf = perfMeter.average()

        template = "[epoch {}] Total {} loss : {:.4f} " "\n" "{}"
        logger.info(
            template.format(
                epoch,prefix,avg_losses["all_loss"],
                "\n".join(["{}: {:.4f}".format(n, l) for n, l in avg_losses.items() if n != "all_loss"]),
            )
        )

        if self.cfg.TENSORBOARD:
            # Logging val Loss
            [self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l, epoch) for n, l in avg_losses.items()]
            # Logging val performances
            [self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v, epoch) for k, v in avg_perf.items()]

        perf_log_str = f"\n------------ Performances ({prefix}) ----------\n"
        for k,v in avg_perf.items():
            perf_log_str += "{:}: {:.4f}\n".format(k, v)
        perf_log_str += "------------------------------------\n"
        logger.info(perf_log_str)

        acc = avg_perf['all_perf']

        del avg_losses, avg_perf
        return acc
Example #36
 def run(self):
     try:
         logger.info("start shanghai vol crawler.")
         html = download(self.url)
         soup = BeautifulSoup(html, "html.parser")
         soup.find("div", {})
         match = re.findall("出售各类商品房<b>(\\d+)</b>套", html)
         if match:
             vol = match[0]
             ds = re.findall("今日楼市((\\d+)-(\\d+)-(\\d+))", html)
             date = datetime.now().replace(year=int(ds[0][0]), month=int(ds[0][1]), day=int(ds[0][2]))
             info = {"city": "上海", "district": "sh", "total": vol, "zhuzai": 0, "date": date}
             has = self.dao.has_item("sh", date)
             if not has[0]:
                 self.dao.insert_item(info)
         logger.info("end shanghai vol crawler.")
     except Exception as e:
         logger.error(e)
Example #37
    def get_month_vol(self, dt, url):
        if not url:
            url = self.url_pattern % dt.strftime('%Y%m%d')
        else:
            match = re.findall("(\\d{8})", url)
            if not match:
                return None
            dt = datetime.datetime(*time.strptime(match[0], '%Y%m%d')[:6])
        logger.info(url)
        html = download(url, charset="utf-8")
        if not html:
            return None
        new_vols = re.findall(r"ss1\.push\((\d+)\);", html)
        old_vols = re.findall(r"ss2\.push\((\d+)\);", html)
        days = re.findall(r"tickss\.push\((\d+)\);", html)
        if len(new_vols) != len(old_vols) or len(days) != len(new_vols):
            logger.info(new_vols)
            logger.info(old_vols)
            logger.info(days)
            return
        for i in range(0, len(days), 1):
            vol_date = dt.replace(dt.year, dt.month, int(days[i]))
            city = "杭州"
            district = "杭州"
            has = self.dao.has_item(district, vol_date)
            if not has[0] and int(new_vols[i]) > 0 and int(old_vols[i]) > 0:
                self.dao.insert_item({"city": city, "district": district,
                                      "total": new_vols[i], "zhuzai": old_vols[i],
                                      "date": vol_date})

        soup = BeautifulSoup(html, "html.parser")
        div_date = soup.find("div", {"class": "date"})
        if div_date:
            if div_date.find("a"):
                path = div_date.find("a")["href"]
                if path:
                    return self.host + path

        return None
Example #38
                    old = self.dao.get_item(row["city"], row["district"], row["date"])
                    if not old:
                        self.dao.insert_item(row)
                    else:
                        self.dao.update_item(city, "月趋势", 0, price, last)
                    month -= 1
                    if month == 0:
                        year -= 1
                        month = 12
                    last = datetime(year, month, 1)

    def run(self):
        try:
            logger.info("start lianjia crawler")
            self.craw_stat()
            self.craw_open()
            self.crawPriceTrends()
            logger.info("end lianjia crawler")
        except Exception as e:
            logger.error("error: %s", e)

if __name__ == "__main__":
    file_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = ''.join([file_dir, "/../../db/house.db"])
    logger.info("start.")
    dao = LianjiaDAO(datetime.today(), db_path)
    tool = Lianjia(dao)
    tool.crawPriceTrends()
    logger.info("end.")