def extract(self):
    '''Extract the job postings from the scraped page and write each one out.

    Binds a ``MySQL('job')`` handle to ``self.db``, then visits every
    ``div.job-list>ul>li`` element, reads the job, company and publisher
    fields from it and passes the resulting record to ``self.write``.
    '''
    self.db = MySQL('job')
    vline = '<em class="vline"></em>'  # markup separating the inline fields
    date_format = '%m月%d日'

    for item in self.findall('div.job-list>ul>li'):
        title = self.find('.job-title', item).text
        salary = self.find('.red', item).text

        # The id is whatever the pattern captures between the path prefix
        # and the trailing "html" of the detail link.
        job_link = self.find('.info-primary>h3.name>a', item).get_attribute('href')
        job_id = re.search(r'/job_detail/(.*).html', job_link).group(1)
        job_info = self.find('.info-primary>p', item).get_attribute('innerHTML')

        company_name = self.find(".info-company>div>h3>a", item).text
        company_link = self.find(".info-company>div>h3>a", item).get_attribute('href')
        company_id = re.search(r'/gongsi/(.*).html', company_link).group(1)

        # First segment is stored as the industry, last segment as the size.
        company_html = self.find('.company-text>p', item).get_attribute('innerHTML')
        company_parts = company_html.split(vline)
        company_industry = company_parts[0]
        company_size = company_parts[-1]

        publisher_html = self.find('.info-publis>h3', item).get_attribute('innerHTML')
        publisher = publisher_html.split(vline)[-1]

        # Keep only the text that follows the "发布于" marker.
        posted = self.find('.info-publis p', item).text.split('发布于')[1]
        try:
            parsed = datetime.datetime.strptime(posted, date_format)
        except ValueError:
            # Text that is not a month/day pair falls back to a fixed date.
            parsed = datetime.datetime.strptime('11月18日', date_format)
        pub_date = parsed.replace(year=2019)

        self.write({
            'title': title,
            'salary': salary,
            'job_info': job_info.replace('\"', '\''),
            'job_id': job_id,
            'company_name': company_name,
            'company_id': company_id,
            'company_industry': company_industry,
            'company_size': company_size,
            'publisher': publisher,
            'pub_date': str(pub_date),
        })
def genVocab(vocabfile):
    """Build a word-count vocabulary from the database and write it to *vocabfile*.

    Reads ``title, brief, content, url`` from the ``news`` and ``crawldata``
    tables, skipping rows whose url was already seen, and counts the
    space-separated tokens of the extracted title, brief and content.
    The ``len`` of each extracted brief/content pair is collected and the
    pairs with a non-zero brief length are saved to ``./data/len.csv``.

    The vocabulary file has one ``word count`` pair per line.  Words seen
    fewer than 5 times are not listed; their counts are added to
    ``Data.UNKNOWN_TOKEN``.  The pad/start/end tokens are written with -1.

    Args:
        vocabfile: Path of the vocabulary file to (over)write.
    """
    mysql = MySQL()
    mysql.login()
    vocab = defaultdict(int)

    def imdict(ab):
        """Count every accepted space-separated token of *ab* in ``vocab``."""
        for a in ab.split(" "):
            a = a.strip()
            # Drop empty tokens and all-lowercase English words.
            # NOTE(review): `rec` / `rec0` are patterns defined outside this
            # function; what exactly `rec0` filters is not visible here.
            if len(a) == 0 or (rec.match(a) and a.islower()) or (rec0.match(a)):
                continue
            vocab[a] += 1

    urlset = set()
    dalist = []
    # try/finally so the connection is released even when a query or the
    # CSV export fails.
    try:
        cursor = mysql.get_cursor()
        for table in ["news", "crawldata"]:
            sent = "select title,brief,content,url from %s where 1" % table
            cursor.execute(sent)
            for title, brief, content, url in cursor.fetchall():
                if url in urlset:
                    continue
                urlset.add(url)

                title = Data.extract_html(title, False)
                imdict(title)

                # Only the "news" table contributes briefs.
                if table == "news" and brief is not None:
                    brief = re.sub("摘要:", "", brief)
                    brief = Data.extract_html(brief, False)
                    imdict(brief)
                    brieflen = len(brief)
                else:
                    brieflen = 0

                # NOTE(review): the ASCII parentheses form a regex group, so
                # this removes the text "资料图图源:" rather than a
                # parenthesised caption - confirm which was intended.
                content = re.sub("资料图(图源:.*?)", "", content)
                try:
                    content = Data.extract_html(content)
                except Exception:
                    # Best effort: rows whose content cannot be extracted are
                    # skipped (their title/brief tokens stay counted).
                    continue
                # NOTE(review): the reason for this pause per row is not
                # evident from this function.
                time.sleep(0.1)
                imdict(content)
                dalist.append([brieflen, len(content)])

        data = pd.DataFrame(columns=["brief", "content"], data=dalist)
        data = data[data['brief'] > 0]
        data.to_csv("./data/len.csv", index=False)
    finally:
        mysql.close()

    newvocab = {
        Data.UNKNOWN_TOKEN: 0,
        Data.PAD_TOKEN: -1,
        Data.SENTENCE_START: -1,
        Data.SENTENCE_END: -1,
    }
    for key, value in vocab.items():
        if value >= 5:
            newvocab[key] = value
        else:
            # Rare words are folded into the unknown-token count.
            newvocab[Data.UNKNOWN_TOKEN] += value

    with open(vocabfile, 'w') as f:
        for word, num in newvocab.items():
            f.write(word + " " + str(num) + "\n")
def __init__(self):
    """Prepare the request headers, HTTP session, sleep interval and database handle."""
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    headers['Sec-Fetch-Mode'] = 'no-cors'
    headers['Host'] = 'arxiv.org'
    self._headers = headers

    self._sess = requests.Session()
    self._sleep_time = 5
    self._mysql = MySQL()
def run(self):
    """Dispatch on the parsed command-line arguments.

    * ``args.filenames`` set: build a ``CSVParser`` from the first two file
      names and call ``format_csv_files()`` on it.
    * ``args.output`` set: dump ``SELECT * from foobar`` to the first output
      path through a ``MySQL`` handle.

    Raises:
        AssertionError: If neither option was supplied.  Raised explicitly
            (not via ``assert``) so the check survives ``python -O``.
    """
    args = self.arguments()
    if args.filenames:
        csvobject = CSVParser(args.filenames[0], args.filenames[1])
        csvobject.format_csv_files()
    elif args.output:
        print('We are connecting to the database...')
        # Here is where you would pass in a MySQL connection
        db = MySQL()
        db.write_to_file('SELECT * from foobar', args.output[0])
    else:
        raise AssertionError("Unhandled")
def __init__(self):
    """Read credentials and timeframe from the command line and reset all state.

    ``sys.argv[1:4]`` supply the user, password and timeframe, in that
    order.  The remaining attributes start out empty, and a ``MySQL``
    handle is stored on ``self.db``.
    """
    argv = sys.argv
    self.user = argv[1]
    self.password = argv[2]
    self.timeframe = argv[3]

    self.base_url = "%simportant" % url
    self.test_page = test_page
    self.test_page_cont = None

    # Each container gets its own fresh, empty dict.
    for attribute in ("art_data", "instruments_dict", "trades", "all_trade_ideas"):
        setattr(self, attribute, {})

    self.db = MySQL()
def Authentication(self):
    """Check the entered credentials against the database from ``config.ini``.

    Reads the ``DATABASE`` section of the config file, calls
    ``user_login`` with the text of the user-name and password widgets,
    and either shows 'Access denied !' in red or shows ``RootWindow`` and
    closes this window.
    """
    core = MainCore()
    config = configparser.ConfigParser()
    config.read(core.resource('config.ini'))

    connection_args = [
        config.get('DATABASE', option)
        for option in ('host', 'user', 'password', 'database')
    ]
    operation = MySQL(*connection_args)
    result = operation.user_login(self.user_name.text(), self.password.text())

    # The second element of the login result holds the matches.
    matches = list(result[1])
    if not matches:
        self.label_error.setStyleSheet('color: red')
        self.label_error.setText('Access denied !')
        return

    permission = result[1][0]  # looked up but not used any further here
    RootWindow.show()
    self.close()
def ExampleGen(num_epochs=None):
    """Yield ``(title, content, brief)`` examples from the ``news`` table.

    Every epoch re-runs the query for rows with a non-empty brief and
    yields one tuple per row, with ``content`` passed through
    ``extract_html(content)`` and ``brief`` through
    ``extract_html(brief, False)``.

    Args:
        num_epochs: Number of passes over the table; ``None`` repeats forever.

    Yields:
        Tuples of ``(title, content, brief)``.
    """
    query = "select title,brief,content from news where brief !=''"
    mysql = MySQL(sqldb="HWCom")
    mysql.login()
    # try/finally so the connection is released when the epochs run out,
    # when the consumer closes or drops the generator, or on an error.
    try:
        cursor = mysql.get_cursor()
        epoch = 0
        while num_epochs is None or epoch < num_epochs:
            cursor.execute(query)
            for title, brief, content in cursor.fetchall():
                yield (title, extract_html(content), extract_html(brief, False))
            epoch += 1
    finally:
        mysql.close()
def create_db(self):
    """Validate the connection settings, then create the database and tables.

    Every value of ``self.get_config`` must be non-empty; otherwise (or
    when there are no values at all) 'Empty field !' is shown in red and
    nothing is created.  With valid settings the first four values are
    passed to ``MySQL``, the database and tables are created, and the
    status label reports success (enabling the save button) or
    'Access denied!'.
    """
    config = list(self.get_config)

    # Check *all* fields, not just the first one, before touching the DB.
    if not config or any(value == '' for value in config):
        self.status_label.setStyleSheet("color: red;")
        self.status_label.setText('Empty field !')
        return

    db = MySQL(config[0], config[1], config[2], config[3])
    db.CreateDatabase()
    if db.CreateTables():
        self.status_label.setStyleSheet("color: green;")
        self.status_label.setText('Success Connect')
        self.save_btn.setEnabled(True)
    else:
        self.status_label.setStyleSheet("color: red;")
        self.status_label.setText("Access denied!")
def main():
    """Collect ladder data per region and record the tracked players.

    With no command-line argument the required per-region files are
    verified and battletags are read per region via ``get_battletags``.
    With one argument, that file supplies the battletags (one per line)
    for every region.  For each region the ladders are fetched and stored,
    every ladder's players are compared case-insensitively against the
    battletags, and matches are added to the database together with their
    races.  Finally all valid players are written to disk.
    """
    # verify that the necessary files exist
    battletag_from_cli = []
    if len(sys.argv) == 1:
        try:
            verify_files_exists(REGION_CODES)
        except FileNotFoundError:
            sys.exit(1)
    elif len(sys.argv) == 2:
        if not os.path.exists(sys.argv[1]):
            # NOTE(review): nothing in this function stops execution after
            # this message; presumably the True flag makes the logger exit -
            # confirm, otherwise the open() below raises FileNotFoundError.
            Log.write_log_message("Specified file does not exist, exiting...", True)
        # Context manager so the file handle is closed after reading.
        with open(sys.argv[1], "r") as btags:
            battletag_from_cli = [btag.strip() for btag in btags]

    # get the API request parameters
    request_parameters = get_request_parameters()

    # get the current season ID
    season_id = -1
    try:
        season_id = API.get_current_season_id(request_parameters)
    except RequestError as e:
        print(e)
        sys.exit(1)
    Log.write_log_message("Current Season ID: {}".format(season_id))

    db_handle = MySQL()
    # try/finally so the database handle is closed even if a region fails.
    try:
        for region in REGION_CODES:
            Log.write_log_message("Starting {} Region".format(region.upper()))

            # get ladders
            ladders = API.get_all_ladders(region, MAX_LEAGUE_ID, season_id, request_parameters)
            Log.write_log_message("Total Ladders Found: {}".format(len(ladders)))

            # add all of the ladders to the database
            try:
                add_ladders_to_database(db_handle, ladders)
            except MySQLdb.IntegrityError:
                Log.write_log_message(
                    "Ladders are already in database for {}".format(region.upper()))

            # read in btags to a list
            if len(battletag_from_cli) == 0:
                battletags = get_battletags(region)
            else:
                battletags = battletag_from_cli
            Log.write_log_message("Battletags Read In: {}".format(len(battletags)))

            # Lower-case the battletags once per region; a set makes each
            # per-player membership test O(1) instead of rebuilding a list.
            wanted = {battletag.lower() for battletag in battletags}

            # go through every ladder looking for one of our players
            for ladder in ladders:
                # get all of the players in the ladder
                players = API.get_players_in_ladder(region, ladder, request_parameters)
                for player in players:
                    if player.battletag.lower() not in wanted:
                        continue
                    # a JSL contestant was found
                    db_handle.add_player(player)
                    for team in player.ladders:
                        db_handle.add_race(player, team)
                    for team in player.ladders:
                        Log.write_log_message(
                            "Found player: {} [{} {} {}]".format(
                                player.battletag, team.league, team.divison, team.race))

        # get all players in database
        Log.write_log_message("Writing valid player data to disk")
        valid_players = db_handle.get_all_valid_players()
        write_valid_players(valid_players)
    finally:
        # close database
        db_handle.close()
def get_product(book, gui):
    '''Get product from Woo.

    Looks the product up in the database by *book*, fetches it from
    WooCommerce and copies the fields that are present into a plain dict.
    Fields that cannot be read are logged and left out.  For every truthy
    ``*_box`` entry of *gui*, the matching field is set to ``None`` when it
    is missing or an empty string.

    Returns the collected dict; it is empty when nothing was found or the
    lookup failed.
    '''
    dictionary = {}

    def copy_field(source, target, key):
        # Copy source[key] into the result under *target*; failures are only logged.
        try:
            dictionary[target] = source[key]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

    def copy_names(source, key):
        # Store the cleaned "name" of every entry of source[key] as a list.
        try:
            dictionary[key] = [
                entry["name"].replace("amp;", "") for entry in source[key]
            ]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

    try:
        # Get product ID from DB.
        mysql_response = MySQL(isbn=book).db_mysql()
        if mysql_response:
            product = WooCommerce(book=book)
            request = product.get_woo_product(mysql_response)
            if request:
                copy_field(request, "id", "id")

                # Attributes are stored under their translated ("en") names.
                try:
                    for attribute in request["attributes"]:
                        translated = product.get_translation(attribute["name"], "en")
                        expanded = product.list_expander(attribute["options"])
                        dictionary[translated] = expanded.replace("amp;", "")
                except Exception as error:  # pylint: disable=broad-except
                    logger.info(error)

                copy_field(request, "name", "name")
                copy_field(request, "description", "description")
                copy_names(request, "categories")
                copy_names(request, "tags")

                try:
                    dictionary["image"] = request["images"][0]["src"]
                except Exception as error:  # pylint: disable=broad-except
                    logger.info(error)

                copy_field(request, "price", "regular_price")
                copy_field(request, "sale_price", "sale_price")
                copy_field(request, "amount", "stock_quantity")

                # Back-fill the fields selected in the GUI with None.
                for key in gui:
                    if "_box" in key and gui[key]:
                        field_name = key.split('_box')[0]
                        if field_name not in dictionary or dictionary[field_name] == "":
                            dictionary[field_name] = None
    except Exception as error:  # pylint: disable=broad-except
        logger.info(error)
    return dictionary
def post_woo_products(self):
    '''Post WooCommerce product.

    Builds the product payload from ``self.book`` (optional tags, image,
    prices, stock and categories are added on a best-effort basis) and
    posts it.  When the response cannot be formatted and its status is
    400, the product is looked up in the database by ISBN and updated
    instead, adding any existing stock to the new amount.

    Returns:
        A dict with ``id``, ``name``, ``link`` and ``source`` (``False``
        for a newly created product, ``True`` for an updated one), or
        ``None`` when the request failed.
    '''
    # (attribute name sent to WooCommerce, key in self.book); id and
    # position are the 1-based index in this table.
    attribute_sources = (
        ("Tytuł", "title"),  # cspell: disable-line
        ("Autor", "authors"),  # cspell: disable-line
        ("Wydawnictwo", "publisher"),  # cspell: disable-line
        ("Rok wydania", "publish_date"),  # cspell: disable-line
        ("Okładka", "binding"),  # cspell: disable-line
        ("ISBN", "isbn"),
    )
    # Pre-bind so every early failure path returns None instead of
    # raising UnboundLocalError at the final return.
    output = None
    try:
        # Auth
        auth = self.get_woo_request()

        # Upload image to media
        image = wp(self.book["image"])

        data = {
            "name": self.book["name"],
            "description": self.book["description"],
            "sku": self.book["isbn"],
            "categories": [],
            "tags": [],
            "attributes": [
                {
                    "id": index,
                    "name": label,
                    "position": index,
                    "visible": True,
                    "variation": True,
                    "options": [self.book[book_key]],
                }
                for index, (label, book_key) in enumerate(attribute_sources, start=1)
            ],
        }

        # Tags
        try:
            if self.book["tags"]:
                tags = self.validate_tags()
                for tag in tags:
                    data["tags"].append({'id': tag})
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        # Image
        try:
            if image:
                data["images"] = [{"src": image}]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        # Price
        try:
            if self.book["price"]:
                data["regular_price"] = self.book["price"]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        # Sale Price
        try:
            if self.book["sale_price"]:
                data["sale_price"] = self.book["sale_price"]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        # Amount
        try:
            if self.book["amount"]:
                data["manage_stock"] = True
                data["stock_quantity"] = self.book["amount"]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        # Get category ID
        try:
            categories = self.validate_category()
            for category in categories:
                data["categories"].append({'id': category})
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        # Send request
        response = auth.post("products", data).json()

        # Send none if status code found in error codes
        if "data" in response:
            if response.get("data", {}).get("status") in self.error_codes:
                # Records the name of this function.
                self.error_catch.append(inspect.getouterframes(inspect.currentframe())[0].function)  # pylint: disable=line-too-long
                return None

        # Format output
        try:
            output = {
                'id': response["id"],
                'name': response["name"],
                'link': response["permalink"],
                'source': False
            }
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)
            if response["data"]["status"] == 400:
                # Pre-bind so a failed lookup is treated as "not found".
                request = None
                try:
                    mysql_request = MySQL(isbn=self.book["isbn"])
                    request = mysql_request.db_mysql()
                except Exception as error:  # pylint: disable=broad-except
                    logger.info(error)
                if request:
                    product = self.get_woo_product(request)
                    if product["stock_quantity"]:
                        # Add the stock already on the product to the new amount.
                        data["stock_quantity"] = int(data["stock_quantity"]) + product["stock_quantity"]  # pylint: disable=line-too-long
                    try:
                        response = self.update_woo_products(
                            product["id"], data)
                        output = {
                            'id': response["id"],
                            'name': response["name"],
                            'link': response["permalink"],
                            'source': True
                        }
                        return output
                    except Exception as error:  # pylint: disable=broad-except
                        logger.info(error)
                else:
                    return None
    except Exception as error:  # pylint: disable=broad-except
        logger.info(error)
    return output