def cron_linkedin_jobs():
    fr = dbg.Function(inspect.currentframe()).report_init()
    ############################################################
    search_keywords = [
        'Data Analytics',
        'Data Analysis',
        'Data Scientist',
        'Data Science',
        'Data Engineer',
        'Machine Learning',
        'Artificial Intelligence (AI)',
        'Natural Language Processing',
        'Business Intelligence (BI)',
        'Python',
        'Node.js',
    ]
    lid = linkedin.LinkedInDriver(driver)
    lid.login().setup_services()
    # collect_on_keywords() takes the keyword list as its first argument.
    lid.jobs.collect_on_keywords(search_keywords, location='Spain', duration=0)
    lid.jobs.collect_on_keywords(search_keywords, location='United Kingdom', duration=0)
    ############################################################
    dup = linkedin.jobs.Deduplicator()
    dup.load_targets().delete_dup_data()
    ############################################################
    linkedin.jobs.parse()
    ############################################################
    fr.report_fin()
def minutes_job():
    """Collect and parse news-page snapshots."""
    print(f"\n {inspect.stack()[0][3]} :\n{inspect.getdoc(minutes_job)}\n")
    fr = dbg.Function(inspect.currentframe()).report_init()
    snapshots.collect()
    snapshots.parse()
    fr.report_fin()
def hours_job():
    """Collect and parse news articles."""
    print(f"\n {inspect.stack()[0][3]} :\n{inspect.getdoc(hours_job)}\n")
    fr = dbg.Function(inspect.currentframe()).report_init()
    articles.collect()
    articles.parse()
    fr.report_fin()
def collect_etri_analysis(pressname='네이버', targetcol='bodytext',
                          techname='LangAnalysis', apicode='srl'):
    fr = dbg.Function(inspect.currentframe()).report_init()
    etric = ETRIAIAnalysisCollector(pressname, targetcol, techname, apicode)
    etric.collect()
    fr.report_fin()
def clustering(self):
    """Since there is no pre-labeled traindf, the training data must be
    clustered separately."""
    fr = dbg.Function(inspect.currentframe()).report_init()
    self.X_train_counts = self.vectorizer.fit_transform(list(self.traindf.noun_txt))
    self.X_train_tfidf = self.tfidf_transformer.fit_transform(self.X_train_counts)
    cluster = self.cluster.fit(self.X_train_tfidf.toarray())
    self.traindf['label'] = cluster.labels_
    fr.report_fin()
    return self
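# A minimal, self-contained sketch of the TF-IDF + clustering step above.
# The concrete estimator behind self.cluster is not shown in this module;
# KMeans is assumed purely for illustration, as are the sample texts.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

texts = ["machine learning job", "data engineer job", "machine learning role"]
counts = CountVectorizer().fit_transform(texts)   # raw term counts
tfidf = TfidfTransformer().fit_transform(counts)  # reweighted by TF-IDF
labels = KMeans(n_clusters=2, random_state=0).fit(tfidf.toarray()).labels_
print(labels)  # one cluster label per input text, e.g. [0 1 0]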
def get_targets(self):
    fr = dbg.Function(inspect.currentframe()).report_init()
    # Load docids for which ETRIAIAnalysis has already been completed.
    docids = self.distinct('docid')
    # Load the analysis targets from the Article model.
    filter = {'_id': {'$nin': docids}, self.targetcol: {'$ne': None}}
    projection = {'_id': 1, self.targetcol: 1}
    self.article.load(filter, projection)
    fr.report_fin()
    return self
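# A raw-pymongo sketch of the query built in get_targets(); the client,
# database, and collection names below are placeholders, not the project's
# real ones.
from pymongo import MongoClient

db = MongoClient()['newsdb']
done = db['ETRIAI'].distinct('docid')  # docids already analyzed
targets = db['Article'].find(
    {'_id': {'$nin': done}, 'bodytext': {'$ne': None}},  # unanalyzed, non-empty
    {'_id': 1, 'bodytext': 1})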
def load_results(targetmodel, targetcol, techname, apicode, method, type_regex):
    """Load ETRI language-analysis results and build a per-document noun table.

    Sample performance report:
        load_results(targetmodel='Article__네이버', targetcol='bodytext',
                     techname='LangAnalysis', apicode='srl',
                     method='morp', type_regex='^NN[GP]')
        etri.load()        : 5.1 mins
        json_normalize()   : 5.8 mins
        new_df conversion  : 11.3 secs
        total runtime      : 11.0 mins
    """
    fr = dbg.Function(inspect.currentframe()).report_init()
    etri = models.ETRIAI(targetmodel, targetcol, techname, apicode)
    etri.load({}, {'_id': 0, 'docid': 1, 'results': 1})
    fr.report_mid(addi_info=" time taken by etri.load().")
    # Flatten the nested results -> sentence -> <method> structure,
    # then keep only the token types matching type_regex.
    df = json_normalize(etri.docs, 'results', ['docid'])
    df = json_normalize(df.to_dict('records'), 'sentence', ['docid'])
    df = json_normalize(df.to_dict('records'), method, ['docid'])
    df = df[df.type.str.contains(pat=type_regex)]
    fr.report_mid(addi_info=" time taken by json_normalize().")
    # Join the lemmas of each document into a single noun string.
    nouns = []
    for n, g in df.groupby('docid'):
        nouns.append({'docid': n, 'noun_txt': " ".join(list(g.lemma))})
    df = pd.DataFrame(nouns)
    fr.report_mid(addi_info=" time taken by the new_df conversion.")
    fr.report_fin()
    return df
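# A toy illustration of the nested flattening performed in load_results().
# The document structure is assumed from the code above: each doc carries a
# 'results' list, each result a 'sentence' list, each sentence a '<method>'
# list of {'type', 'lemma'} records. The sample doc is invented.
from pandas import json_normalize

docs = [{'docid': 'a1',
         'results': [{'sentence': [{'morp': [{'type': 'NNG', 'lemma': '뉴스'},
                                             {'type': 'VV', 'lemma': '보'}]}]}]}]
df = json_normalize(docs, 'results', ['docid'])
df = json_normalize(df.to_dict('records'), 'sentence', ['docid'])
df = json_normalize(df.to_dict('records'), 'morp', ['docid'])
print(df[df.type.str.contains(pat='^NN[GP]')])  # keeps only common/proper nouns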
def collect_1_keyword(keyword='data analytics', location='Spain', duration=1,
                      pagination_sleep=3):
    fr = dbg.Function(inspect.currentframe()).report_init()
    c = Collector()
    c.move_to_job_search_page()
    c.set_searching(keyword, location).set_date(duration).set_sorting('date')
    c.extract_keyword_location()
    c.loop_pagination(sleepsecs=pagination_sleep)
    fr.report_fin()
def collect_on_keywords(self, search_keywords, location='Spain', duration=0):
    fr = dbg.Function(inspect.currentframe()).report_init()
    ############################################################
    # Forward location/duration to every per-keyword call; FunctionIterator
    # is assumed to invoke the given callable once per keyword.
    fi = FunctionIterator(
        search_keywords,
        lambda kw: self.collect_on_keyword(kw, location=location, duration=duration))
    while fi.iterable:
        fi.nextop()
    ############################################################
    fr.report_fin()
def jnload(self, filter=None):
    fr = dbg.Function(inspect.currentframe()).report_init()
    # Avoid a mutable default argument; the filter dict is mutated below.
    filter = {} if filter is None else filter
    filter.update({'layout': {'$ne': None}})
    self.load(filter, {'html': 0})
    fr.report_fin()
    if len(self.docs) == 0:
        print("\n len(self.docs) == 0.\n")
    else:
        return json_normalize(
            self.docs, 'layout',
            ['_id', 'snapshot_dt']).rename(columns={'_id': 'snapshotID'})
def days_job():
    """Collect ETRI language-analysis results."""
    print(f"\n {inspect.stack()[0][3]} :\n{inspect.getdoc(days_job)}\n")
    fr = dbg.Function(inspect.currentframe()).report_init()
    articles.collect_etri_analysis(pressname='네이버', targetcol='headline',
                                   techname='LangAnalysis', apicode='srl')
    articles.collect_etri_analysis(pressname='네이버', targetcol='bodytext',
                                   techname='LangAnalysis', apicode='srl')
    fr.report_fin()
def collect_on_keyword(self, keyword='machine learning', location='Spain', duration=0):
    fr = dbg.Function(inspect.currentframe()).report_init()
    ############################################################
    self.move_to_job_search_page()
    self.put_searching_keyword(
        keyword, location).choose_posted_duration(duration).set_sorting('date')
    ############################################################
    self.extract_keyword_location()
    self.loop_pagination()
    ############################################################
    fr.report_fin()
def collect(pressname=None, pagename=None):
    fr = dbg.Function(inspect.currentframe()).report_init()
    filter = {}
    if isinstance(pressname, str) and isinstance(pagename, str):
        filter.update({'pressname': pressname, 'name': pagename})
    page = models.NewsPage().load(filter)
    loop = dbg.Loop(
        f"{sys.modules[__name__].__file__} | {inspect.stack()[0][3]}",
        len(page.docs))
    for d in page.docs:
        page.attributize(d)
        c = Collector(page.pressname, page.name)
        c.collect()
        loop.report(
            addi_info=f" pressname : {page.pressname}, pagename : {page.name}")
    fr.report_fin()
def deduplicate(self):
    func = dbg.Function(inspect.currentframe()).report_init()
    df = self.load().get_df()
    func.report_mid(addi_info=" time taken by self.load().get_df().")
    if len(df) == 0:
        print("\n len(df) == 0.")
    else:
        TF = df.sort_values(['url', 'snapshot_dt']).duplicated(keep='first', subset=['url'])
        df1 = df[TF]
        if len(df1) == 0:
            print("\n len(duplicated_df) == 0.")
        else:
            # Delete by _id; list(df1) alone would yield the column names.
            self.delete_many({'_id': {'$in': list(df1._id)}})
    func.report_fin()
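# A small check of the duplicate-detection logic used in deduplicate():
# after sorting by (url, snapshot_dt), duplicated(keep='first') flags every
# later snapshot of the same url. The sample data is invented.
import pandas as pd

df = pd.DataFrame({'url': ['u1', 'u1', 'u2'],
                   'snapshot_dt': ['2019-01-01', '2019-01-02', '2019-01-01']})
TF = df.sort_values(['url', 'snapshot_dt']).duplicated(keep='first', subset=['url'])
print(df[TF])  # only the second snapshot of u1 is flagged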
def tokenize(self):
    fr = dbg.Function(inspect.currentframe()).report_init()
    method = 'morp'
    type_regex = '^NN[GP]'
    self.load(None, {'_id': 1, self.targetcol: 1})
    fr.report_mid(addi_info=" time taken by self.load(None, {'_id': 1, self.targetcol: 1}).")
    for d in self.docs:
        self.attributize(d)
        df = etri.load_result(self.submodel, self.targetcol, self.techname,
                              self.apicode, self._id, method, type_regex)
        if df is None:
            print("\n df is None.")
        else:
            d['tokens'] = list(df.lemma)
    fr.report_fin()
    return self
def collect_on_keyword(keyword='machine learning', location='Spain', duration=0,
                       pagination_sleep=3):
    fr = dbg.Function(inspect.currentframe()).report_init()
    nextpage_click_secs = pagination_sleep
    jobcard_click_secs = 3
    job_details_human_reading_secs = 20
    c = Collector(nextpage_click_secs, jobcard_click_secs,
                  job_details_human_reading_secs)
    c.move_to_job_search_page()
    c.put_searching_keyword(
        keyword, location).choose_posted_duration(duration).set_sorting('date')
    c.extract_keyword_location()
    c.loop_pagination()
    fr.report_fin()
def collect_on_keyword(self, keyword='machine learning', location='Spain', duration=0):
    fr = dbg.Function(inspect.currentframe()).report_init()
    ############################################################
    self.collect_dt = datetime.now().astimezone()
    self.move_to_job_search_page()
    self.put_searching_keyword(
        keyword, location).choose_posted_duration(duration).set_sorting('date')
    self.extract_keyword_location()
    ############################################################
    if self.is_readyto_collect:
        self.iter_jobcards()
    ############################################################
    fr.report_fin()
def collect_pages(keyword='data analytics', location='Spain', duration=3, sleepsecs=2):
    fr = dbg.Function(inspect.currentframe()).report_init()
    sc = SearchConditionSetter()
    sc.set_searching(keyword, location).set_date(duration).set_sorting('date')
    # Find the last and the current page numbers.
    pagination = driver.find_element_by_class_name(
        'search-results-pagination-section')
    pbuttons = pagination.find_elements_by_tag_name('li')
    print(f"\n pagination_count : {len(pbuttons)}\n")
    if len(pbuttons) == 0:
        collect_1page()
    else:
        last_pagenum = int(pbuttons[-1].find_element_by_tag_name('span').text)
        print(f" last_pagenum : {last_pagenum}")
        cur_pbutton = pagination.find_element_by_xpath(
            '//li[contains(@class, "active selected")]')
        cur_pagenum = int(cur_pbutton.find_element_by_tag_name('span').text)
        paginate(cur_pagenum, last_pagenum, sleepsecs)
    fr.report_fin()
def collect_1page(sleepsecs=10):
    """Collect and parse the 25 job postings on one search-result URL.

    Iterates over the job-card list, collecting and analyzing the details
    of each posting.
    """
    fr = dbg.Function(inspect.currentframe()).report_init()
    uo = urlparse(driver.current_url)
    if ('keywords' in uo.query) and ('location' in uo.query):
        # jobcards = driver.find_elements_by_class_name('occludable-update')
        jobcards = driver.find_elements_by_class_name('artdeco-list__item')
        collect_dt = datetime.today().astimezone()
        loop = dbg.Loop(f"{inspect.stack()[0][3]} | jobcard-index progress",
                        len(jobcards))
        for i, jobcard in enumerate(jobcards):
            time.sleep(sleepsecs)
            ############################################################
            jobcard_job_details(jobcard)
            ############################################################
            loop.report()
    else:
        print(f"\n keywords and location are missing from the url.\n"
              f" driver.current_url : {driver.current_url}")
        time.sleep(sleepsecs)
    fr.report_fin()
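# A quick illustration of the url guard used above: urlparse().query exposes
# the raw query string, so a plain substring check suffices here. The sample
# URL is invented.
from urllib.parse import urlparse

uo = urlparse('https://www.linkedin.com/jobs/search/?keywords=python&location=Spain')
print(('keywords' in uo.query) and ('location' in uo.query))  # True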
def classify(self, algorithm):
    fr = dbg.Function(inspect.currentframe()).report_init()
    traindf = self.traindf.copy()
    if algorithm == 'MultinomialNB':
        self.clf = Pipeline([
            ('vect', self.vectorizer),
            ('tfidf', self.tfidf_transformer),
            ('clf', MultinomialNB()),
        ])
        self.clf.fit(list(traindf.noun_txt), list(traindf.label))
    elif algorithm == 'SGDClassifier':
        self.clf = Pipeline([
            ('vect', self.vectorizer),
            ('tfidf', self.tfidf_transformer),
            ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                  random_state=42, max_iter=5, tol=None)),
        ])
        self.clf.fit(list(traindf.noun_txt), list(traindf.label))
    else:
        self.clf = None
    if self.clf is not None:
        self.predicted = self.clf.predict(list(self.testdf.noun_txt))
        self.testdf['predicted'] = self.predicted
    fr.report_fin()
    return self
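# A minimal usage sketch of the Pipeline built in classify(), assuming
# self.vectorizer and self.tfidf_transformer are the usual scikit-learn
# CountVectorizer and TfidfTransformer; the sample texts and labels are
# invented.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])
clf.fit(["python data engineer", "frontend react developer"], ['data', 'web'])
print(clf.predict(["senior data scientist"]))  # -> ['data']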
def parse():
    fr = dbg.Function(inspect.currentframe()).report_init()
    press.naver.parse_articles()
    fr.report_fin()