def guess_sport(data, full_text):
    # First pass: count sport keywords in the infobox data
    data_lc = clean_text(str(data)).lower()
    print(data_lc)
    kickboxing_refs1 = len(re.findall(r'kickbox', data_lc))
    boxing_refs1 = len(re.findall(r'[\W.,]+box', data_lc))
    mma_refs1 = len(re.findall(r'(mixed martial arts)|(mma)', data_lc))
    print(kickboxing_refs1, boxing_refs1, mma_refs1)
    if kickboxing_refs1 > boxing_refs1 and kickboxing_refs1 > mma_refs1:
        return kickboxing_key
    if mma_refs1 > boxing_refs1 and mma_refs1 > kickboxing_refs1:
        return mma_key
    if boxing_refs1 > mma_refs1 and boxing_refs1 > kickboxing_refs1:
        return boxing_key

    # Second pass: fall back to counting keywords in the full page text
    data_lc = clean_text(str(full_text)).lower()
    kickboxing_refs2 = len(re.findall(r'kickbox', data_lc))
    boxing_refs2 = len(re.findall(r'[\W.,]+box', data_lc))
    mma_refs2 = len(re.findall(r'(mixed martial arts)|(mma)', data_lc))
    print(kickboxing_refs2, boxing_refs2, mma_refs2)
    if kickboxing_refs2 > boxing_refs2 and kickboxing_refs2 > mma_refs2:
        return kickboxing_key
    if mma_refs2 > boxing_refs2 and mma_refs2 > kickboxing_refs2:
        return mma_key
    if boxing_refs2 > mma_refs2 and boxing_refs2 > kickboxing_refs2:
        return boxing_key
    return other_key
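# A minimal usage sketch for guess_sport, not part of the original module: the
# function is normally fed the Wikipedia infobox element and the raw page HTML,
# but any pair of strings works since both arguments are passed through str().
# The _demo_* name is hypothetical and the result depends on clean_text and the
# module-level *_key constants defined elsewhere in this file.
def _demo_guess_sport():
    infobox_text = "Professional boxing record: 20 wins, 2 losses"
    page_html = "<p>He retired from professional boxing in 2010.</p>"
    # Only boxing keywords appear, so boxing_key is the expected return value.
    print(guess_sport(infobox_text, page_html))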
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to ApInfo
    for html_file_path in os.listdir(JOB_FOLDER):
        # Parse only HTML files
        if html_file_path.endswith(".html"):
            job_id = re.findall(r'\d+', html_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open(JOB_FOLDER + "/" + html_file_path) as htmlfile:
                        soup = BeautifulSoup(htmlfile.read())

                    job_info = soup.find("div", class_="info-data").text.strip()
                    date = job_info.rsplit('-', 1)[1].strip()
                    location = clean_text(job_info.rsplit('-', 1)[0])
                    city = location.rsplit('-', 1)[0].strip()
                    state = location.rsplit('-', 1)[1].strip()

                    job_title = soup.find("div", class_="cargo m-tb").text
                    job_title = clean_apinfo_jobtitle(job_title)

                    company = clean_text(
                        soup.find('div', class_="texto").contents[3].contents[2])
                    job_description = clean_text(
                        soup.find('div', class_="texto").contents[1].text)

                    data = {
                        'date': date,
                        'job_title': job_title,
                        'company': company,
                        'location_city': city,
                        'location_state': state,
                        'job_description': job_description,
                        'job_platform': job_platform,
                        'job_platform_id': job_id
                    }
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def extract_general_method(s):
    # Bucket a fight-result method cell (e.g. "TKO (punches)") into a coarse category
    split_method = str(s).split('(')
    if len(split_method) >= 2:
        clean_method = clean_text(split_method[0])
    else:
        clean_method = clean_text(s)
    if 'tko' in clean_method or 'ko' in clean_method:
        return 'ko'
    elif 'submission' in clean_method:
        return 'submission'
    elif 'decision' in clean_method:
        return 'decision'
    else:
        return 'other'
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to Trampos
    for json_file_path in os.listdir(JOB_FOLDER):
        # Parse only JSON files
        if json_file_path.endswith(".json"):
            job_id = re.findall(r'\d+', json_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open("%s/%s" % (JOB_FOLDER, json_file_path)) as json_data:
                        job_data = json.load(json_data)['opportunity']

                    data = {}
                    data['date'] = get_date(job_data['published_at'])
                    if 'city' in job_data and 'state' in job_data:
                        data['city'] = clean_text(job_data['city'])
                        data['state'] = clean_text(job_data['state'])
                    elif 'home_office' in job_data:
                        data['home_office'] = True
                    data['job_title'] = clean_text(job_data['name'])
                    if 'company' in job_data and job_data['company']:
                        data['company'] = clean_text(job_data['company']['name'])
                    data['job_description'] = "%s %s %s" % (
                        clean_text(job_data['description']),
                        clean_text(job_data['prerequisite']),
                        clean_text(job_data['desirable']))
                    data['job_platform'] = job_platform
                    data['job_platform_id'] = job_id
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def transform(self, documents):
    documents = [tokenize(d) for d in documents]
    documents = [d[:self.max_page_size] for d in documents]
    documents = [' '.join(d) for d in documents]

    if self.encoding_type in ['tfidf', 'count', 'binary']:
        return self.vectorizer.transform(documents).toarray()

    if self.encoding_type == 'lda':
        documents_tokenized = [tokenize(i) for i in documents]
        other_corpus = [
            self.common_dictionary.doc2bow(i) for i in documents_tokenized
        ]
        results = []
        for i in other_corpus:
            result = self.vectorizer[i]
            result = vectorize_topic_models(result, self.num_of_topics)
            results.append(result)
        return np.array(results)

    if self.encoding_type in ['doc2vec']:
        documents_tokenized = [tokenize(i) for i in documents]
        results = []
        for i in documents_tokenized:
            if i:
                try:
                    results.append(self.vectorizer[i][0])
                except KeyError:
                    results.append([0 for _ in range(self.encoding_size)])
            else:
                results.append([0 for _ in range(self.encoding_size)])
        return np.array(results)

    if self.encoding_type in ['fasttext']:
        documents_clean = [clean_text(i) for i in documents]
        results = []
        for i in documents_clean:
            if i:
                results.append(self.vectorizer.get_sentence_vector(i))
                # results.append(self.vectorizer[i])
            else:
                results.append(
                    np.array([0 for _ in range(self.encoding_size)]))
        return np.array(results)
def scrape_fighter(next_url):
    sections_dict = dict()
    r = requests.get(next_url)
    soup = BeautifulSoup(r.text)
    stats_table_card = soup.find('table', {'class': 'infobox vcard'})
    general_stats = get_general_info(stats_table_card)
    mw_parser_output = soup.find('div', {'class': 'mw-parser-output'})
    if mw_parser_output:
        page_items = mw_parser_output.find_all(['h2', 'h3', 'table'])

        # Group page elements under the h2 heading they appear beneath
        sections = dict()
        active_key = None
        for i in page_items:
            if i.name == 'h2':
                active_key = clean_text(i.get_text()).lower()
                sections[active_key] = []
            if active_key:
                sections[active_key].append(i)

        # Warn about record-like section headings that are not in the known lists
        for i in sections:
            if 'mma' in i.lower() or 'boxing' in i.lower() or 'record' in i.lower():
                if (i not in mma_page_section_names
                        and i not in boxing_page_section_names
                        and i not in [
                            'Possibly missed key: Amateur kickboxing career[edit]',
                            'Possibly missed key: Professional boxing career[edit]',
                            'Possibly missed key: Mixed martial arts career[edit]',
                            'Possibly missed key: Kickboxing record (Incomplete)[edit]']):
                    print('Possibly missed key: {0}'.format(i))

        active_key2 = None
        for c1, i in enumerate(sections):
            if i in mma_page_section_names:
                sections_dict.setdefault(mma_key, dict())
                sections_dict[mma_key].setdefault(exhibition_key, list())
                sections_dict[mma_key].setdefault(pro_key, list())
                sections_dict[mma_key].setdefault(amateur_key, list())
                active_key1 = mma_key
            elif i in boxing_page_section_names:
                sections_dict.setdefault(boxing_key, dict())
                sections_dict[boxing_key].setdefault(exhibition_key, list())
                sections_dict[boxing_key].setdefault(pro_key, list())
                sections_dict[boxing_key].setdefault(amateur_key, list())
                active_key1 = boxing_key
            # elif i in kickboxing_page_section_names:
            #     sections_dict.setdefault(kickboxing_key, dict())
            #     sections_dict[kickboxing_key].setdefault(exhibition_key, list())
            #     sections_dict[kickboxing_key].setdefault(pro_key, list())
            #     sections_dict[kickboxing_key].setdefault(amateur_key, list())
            #     active_key1 = kickboxing_key
            elif i in mma_amateur_section:
                sections_dict.setdefault(mma_key, dict())
                sections_dict[mma_key].setdefault(amateur_key, list())
                active_key1 = mma_key
                active_key2 = amateur_key
            elif i in mma_pro_section:
                sections_dict.setdefault(mma_key, dict())
                sections_dict[mma_key].setdefault(pro_key, list())
                active_key1 = mma_key
                active_key2 = pro_key
            elif i in professional_boxing_section_names:
                sections_dict.setdefault(boxing_key, dict())
                sections_dict[boxing_key].setdefault(pro_key, list())
                active_key1 = boxing_key
                active_key2 = pro_key
            elif i in professional_record_names:
                sport_type = guess_sport(stats_table_card, r.text)
                print('guessing sport: {0} {1}'.format(next_url, sport_type))
                active_key1 = sport_type
                sections_dict.setdefault(sport_type, dict())
                sections_dict[sport_type].setdefault(amateur_key, list())
                sections_dict[sport_type].setdefault(pro_key, list())
            else:
                continue

            # Assign each data table in this section to a pro/amateur/exhibition bucket
            for c2, j in enumerate(sections[i]):
                data_type = get_table_type(j)
                # print(data_type)
                if data_type == other_key:
                    continue
                if data_type == data_table_key and c2 < 2:
                    active_key2 = pro_key
                if data_type in [pro_key, amateur_key, exhibition_key]:
                    active_key2 = data_type
                if data_type == data_table_key and active_key2:
                    sections_dict[active_key1][active_key2].append(j)
                    active_key2 = None
    return sections_dict, general_stats
def extract_table(f_url, tables_dict, general_stats):
    dfs = []
    new_urls = []
    fighter_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f_url))

    sport_keys = tables_dict.keys()
    for s in sport_keys:
        types_of_events = tables_dict[s].keys()
        for t in types_of_events:
            print(s, t)
            for j in tables_dict[s][t]:
                print(1)
                tr_tags = j.find_all('tr')
                header_index = get_row_num_of_headers(j)
                opponent_col_name = [
                    i.get_text().strip()
                    for i in tr_tags[header_index].find_all(['th', 'td'])
                    if 'opponent' in i.get_text().strip().lower()][0]
                index_of_opponent = [
                    c for c, i in enumerate(tr_tags[header_index].find_all(['th', 'td']))
                    if opponent_col_name.strip() == i.get_text().strip()][0]

                # Map each opponent name to a stable id and whether they have a wiki page
                id_mapping = dict()
                for k in tr_tags[header_index + 1:]:
                    opponent_cell = k.find_all('td')[index_of_opponent]
                    opponent_a_tag = opponent_cell.find_all('a')
                    opponent_name = opponent_cell.get_text().strip()
                    opponent_rel_links = [
                        k2['href'] for k2 in opponent_a_tag
                        if k2['href'] not in links_to_avoid]
                    if opponent_rel_links:
                        opponent_rel_link = opponent_rel_links[-1]
                        opponent_abs_link = urljoin(base_url, opponent_rel_link)
                        new_urls.append(opponent_abs_link)
                        opponent_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, opponent_abs_link))
                        has_wiki = 1
                    else:
                        opponent_id = str(uuid.uuid4())
                        has_wiki = 0
                    id_mapping[opponent_name] = {'id': opponent_id, 'has_wiki': has_wiki}

                df = pd.read_html(str(j), header=header_index)[0]
                df['fighter_id'] = fighter_id
                df['opponent_id'] = df.apply(
                    lambda x: id_mapping.get(x[opponent_col_name], {'id': str(uuid.uuid4())})['id'],
                    axis=1)
                df['opponent_has_wiki'] = df.apply(
                    lambda x: id_mapping.get(x[opponent_col_name], {'has_wiki': 0})['has_wiki'],
                    axis=1)
                df['sport'] = s
                df['event_type'] = t
                for g in general_stats:
                    df[g] = general_stats[g]
                df = df.applymap(lambda x: clean_text(x))
                dfs.append(df)

    if dfs:
        df = pd.concat(dfs)
    else:
        df = pd.DataFrame()
    return df, new_urls
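# A rough crawl-loop sketch (not part of the original pipeline) showing how
# scrape_fighter and extract_table are meant to be chained: scrape one fighter
# page, flatten its record tables, and queue opponents that have Wikipedia pages.
# The function name, page limit, and CSV output path are illustrative assumptions.
def _demo_crawl(seed_url, max_pages=25):
    to_visit = [seed_url]
    visited = set()
    frames = []
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)
        sections_dict, general_stats = scrape_fighter(url)
        record_df, new_urls = extract_table(url, sections_dict, general_stats)
        if not record_df.empty:
            frames.append(record_df)
        to_visit.extend(u for u in new_urls if u not in visited)
    if frames:
        pd.concat(frames).to_csv('fight_records_sample.csv', index=False)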
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to Ceviu
    for html_file_path in os.listdir(JOB_FOLDER):
        # Parse only HTML files
        if html_file_path.endswith(".html"):
            job_id = re.findall(r'\d+', html_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open(JOB_FOLDER + "/" + html_file_path) as htmlfile:
                        soup = BeautifulSoup(htmlfile.read())

                    job_info = soup.find("p", class_="codigo-data-vaga").text.strip()
                    date = job_info.rsplit('Data: ', 1)[1].strip()

                    location = soup.find('div', class_="localizacao-vaga").text
                    location = clean_text(location)
                    city = re.search('Localizacao (.*)/', location, re.IGNORECASE).group(1)
                    state = re.search('/(.*)', location, re.IGNORECASE).group(1)

                    job_title = soup.find("h2", class_="titulo-vaga").text
                    job_title = clean_text(job_title)

                    company = None
                    if soup.find('a', class_="nome-empresa"):
                        company = soup.find('a', class_="nome-empresa").text
                        company = clean_text(company)

                    job_description = soup.find('div', class_='descricao-vaga').text
                    job_description = clean_text(job_description)
                    job_description = re.sub("Descricao da vaga ", "", job_description)
                    job_description = re.sub("Vaga Patrocinada ", "", job_description)

                    data = {
                        'date': date,
                        'job_title': job_title,
                        'location_city': city,
                        'location_state': state,
                        'job_description': job_description,
                        'job_platform': job_platform,
                        'job_platform_id': job_id
                    }
                    if company:
                        data['company'] = company
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def clean_name(s):
    # Drop a quoted nickname, e.g. 'First "Nickname" Last' -> 'First Last'
    s_split = str(s).split('"')
    if len(s_split) >= 3:
        return clean_text(s_split[0] + s_split[-1])
    return clean_text(s)
def extract_details(s):
    # Return the text after the first '(' in a method cell, e.g. the "(punches)" part
    split_method = str(s).split('(')
    if len(split_method) >= 2:
        return clean_text(' '.join(split_method[1:]))
def extract_method(s):
    # Return the text before the first '(' in a method cell, e.g. "TKO"
    split_method = str(s).split('(')
    if len(split_method) >= 2:
        return clean_text(split_method[0])
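# A small illustrative sketch (not in the original code) of how the cell-parsing
# helpers above are used on Wikipedia record-table values. Exact return strings
# depend on clean_text, defined elsewhere in this module, so the comments describe
# the intent rather than guaranteed output.
def _demo_cell_parsers():
    print(clean_name('Quinton "Rampage" Jackson'))        # quoted nickname dropped
    print(extract_method('TKO (punches)'))                # text before the parenthesis
    print(extract_details('TKO (punches)'))               # text after the parenthesis
    print(extract_general_method('Submission (armbar)'))  # coarse bucket, e.g. 'submission'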
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to NetCarreiras
    for html_file_path in os.listdir(JOB_FOLDER):
        # Parse only HTML files
        if html_file_path.endswith(".html"):
            job_id = re.findall(r'\d+', html_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open("%s/%s" % (JOB_FOLDER, html_file_path)) as htmlfile:
                        soup = BeautifulSoup(htmlfile.read())

                    date = soup.find('div', class_="profile").contents[3].text
                    date = clean_text(date)

                    location = soup.find('div', {'id': "location"}).text
                    location = clean_text(location)
                    city = re.search('(.*) -', location, re.IGNORECASE).group(1)
                    state = re.search(r'- (.*) \(', location, re.IGNORECASE).group(1)

                    job_title = soup.find("h1").text
                    job_title = clean_text(job_title)

                    company = None
                    if soup.find_all('a', href=re.compile('^vagas-na-(.*)')):
                        company = soup.find_all('a', href=re.compile('^vagas-na-(.*)'))[0].text
                        company = clean_text(company)

                    job_description = soup.find('article').contents[11].text
                    job_description = clean_text(job_description)

                    data = {
                        'date': date,
                        'job_title': job_title,
                        'location_city': city,
                        'location_state': state,
                        'job_description': job_description,
                        'job_platform': job_platform,
                        'job_platform_id': job_id
                    }
                    if company:
                        data['company'] = company
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def fit(self, documents):
    documents = [tokenize(d) for d in documents]
    documents = [d[:self.max_page_size] for d in documents]
    documents = [' '.join(d) for d in documents]

    if self.encoding_type in ['tfidf', 'count', 'binary']:
        if self.encoding_type == 'tfidf':
            # tf-idf weighted bag of n-grams; TfidfVectorizer comes from
            # sklearn.feature_extraction.text, same module as CountVectorizer
            self.vectorizer = TfidfVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
        if self.encoding_type == 'count':
            # raw n-gram counts
            self.vectorizer = CountVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                binary=False,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
        if self.encoding_type == 'binary':
            # presence/absence of each n-gram
            self.vectorizer = CountVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                binary=True,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
        self.vectorizer.fit(documents)
        with open(self.save_file_loc, 'wb') as f:
            pickle.dump(self.vectorizer, f)

    if self.encoding_type == 'lda':
        documents_tokenized = [tokenize(i) for i in documents]
        self.common_dictionary = Dictionary(documents_tokenized)
        common_corpus = [
            self.common_dictionary.doc2bow(text)
            for text in documents_tokenized
        ]
        self.vectorizer = ldamodel.LdaModel(common_corpus,
                                            id2word=self.common_dictionary,
                                            num_topics=self.num_of_topics,
                                            passes=self.vectorizer_epochs)
        self.vectorizer.save(self.save_file_loc)

    if self.encoding_type == 'doc2vec':
        tagged_documents = [
            TaggedDocument(tokenize(doc), [i])
            for i, doc in enumerate(documents)
        ]
        self.vectorizer = Doc2Vec(tagged_documents,
                                  vector_size=self.encoding_size,
                                  window=2,
                                  min_count=1,
                                  workers=4,
                                  epochs=self.vectorizer_epochs,
                                  max_vocab_size=100000)
        self.vectorizer.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)
        self.vectorizer.save(self.save_file_loc)

    if self.encoding_type == 'fasttext':
        with open(self.fasttext_training_file_location, 'w') as f:
            for i in documents:
                f.write(clean_text(i) + '\n')
        self.vectorizer = fasttext.train_unsupervised(
            self.fasttext_training_file_location,
            model=self.fasttext_algorithm,
            dim=self.encoding_size)
        self.vectorizer.save_model(self.save_file_loc)
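# A minimal usage sketch for the fit/transform pair above, assuming `encoder` is an
# already-constructed instance of the enclosing vectorizer class configured with
# encoding_type='tfidf' (the class name and constructor arguments live elsewhere in
# the project, so none are spelled out here).
def _demo_encoder(encoder):
    train_docs = ["data engineer python sql",
                  "frontend developer javascript react"]
    encoder.fit(train_docs)                         # fits and pickles the vectorizer
    matrix = encoder.transform(["python data pipelines"])
    print(matrix.shape)                             # (1, vocabulary size)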