Example #1
def cron_linkedin_jobs():
    fr = dbg.Function(inspect.currentframe()).report_init()
    ############################################################
    search_keywords = [
        'Data Analytics',
        'Data Analysis',
        'Data Scientist',
        'Data Science',
        'Data Engineer',
        'Machine Learning',
        'Artificial Intelligence (AI)',
        'Natural Language Processing',
        'Business Intelligence (BI)',
        'Python',
        'Node.js',
    ]
    lid = linkedin.LinkedInDriver(driver)
    lid.login().setup_services()
    lid.jobs.collect_on_keywords(search_keywords, location='Spain', duration=0)
    lid.jobs.collect_on_keywords(search_keywords, location='United Kingdom', duration=0)
    ############################################################
    dup = linkedin.jobs.Deduplicator()
    dup.load_targets().delete_dup_data()
    ############################################################
    linkedin.jobs.parse()
    ############################################################
    fr.report_fin()
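
All of the examples on this page share the same instrumentation pattern: report_init() at entry, optional report_mid() checkpoints, and report_fin() at exit. The sketch below is a hypothetical stand-in for idebug's dbg.Function, written only to make that lifecycle concrete; the real class's internals are not shown anywhere on this page.

# Hypothetical stand-in for idebug's dbg.Function; illustration only.
import inspect
from datetime import datetime

class FunctionReport:
    def __init__(self, frame):
        self.caller = frame.f_code.co_name  # same frame trick as the examples
        self.start_dt = datetime.now().astimezone()
        self.last_dt = self.start_dt

    def report_init(self):
        print(f"report_init | caller : {self.caller}")
        return self  # allows fr = FunctionReport(...).report_init()

    def report_mid(self, addi_info=None):
        now = datetime.now().astimezone()
        print(f"report_mid | {self.caller} | {addi_info} | {now - self.last_dt}")
        self.last_dt = now

    def report_fin(self, addi_info=None):
        now = datetime.now().astimezone()
        print(f"report_fin | {self.caller} | {addi_info} | {now - self.start_dt}")

fr = FunctionReport(inspect.currentframe()).report_init()
fr.report_fin()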
Example #2
def minutes_job():
    """뉴스페이지 스냅샷 수집, 파싱"""
    print(f"\n {inspect.stack()[0][3]} :\n{inspect.getdoc(minutes_job)}\n")
    fr = dbg.Function(inspect.currentframe()).report_init()
    snapshots.collect()
    snapshots.parse()
    fr.report_fin()
Example #3
def hours_job():
    """뉴스_수집, 뉴스_파싱"""
    print(f"\n {inspect.stack()[0][3]} :\n{inspect.getdoc(hours_job)}\n")
    fr = dbg.Function(inspect.currentframe()).report_init()
    articles.collect()
    articles.parse()
    fr.report_fin()
Example #4
def collect_etri_analysis(pressname='네이버',
                          targetcol='bodytext',
                          techname='LangAnalysis',
                          apicode='srl'):
    fr = dbg.Function(inspect.currentframe()).report_init()
    etric = ETRIAIAnalysisCollector(pressname, targetcol, techname, apicode)
    etric.collect()
    fr.report_fin()
Example #5
 def clustering(self):
     """이미 분류된 traindf가 없기 때문에 별도로 클러스터링 해야 한다."""
     fr = dbg.Function(inspect.currentframe()).report_init()
     self.X_train_counts = self.vectorizer.fit_transform(list(self.traindf.noun_txt))
     self.X_train_tfidf = self.tfidf_transformer.fit_transform(self.X_train_counts)
     cluster = self.cluster.fit(self.X_train_tfidf.toarray())
     self.traindf['label'] = cluster.labels_
     fr.report_fin()
     return self
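
For context, the same counts -> tf-idf -> cluster chain can be run standalone on toy data. The estimator choices below are assumptions; the example never shows how self.vectorizer, self.tfidf_transformer, and self.cluster are configured.

# Standalone toy run of the clustering chain; every estimator choice is assumed.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans

docs = ["economy market stocks", "football match goal", "stocks bonds market"]
counts = CountVectorizer().fit_transform(docs)    # X_train_counts
tfidf = TfidfTransformer().fit_transform(counts)  # X_train_tfidf
labels = KMeans(n_clusters=2, n_init=10, random_state=42).fit(tfidf.toarray()).labels_
print(labels)  # docs 0 and 2 land in the same cluster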
Example #6
 def get_targets(self):
     fr = dbg.Function(inspect.currentframe()).report_init()
     """기존 ETRIAIAnalysis 완료된 docids 로딩."""
     docids = self.distinct('docid')
     """Article-모델에서 분석할 타겟 로딩."""
     filter = {'_id': {'$nin': docids}, self.targetcol: {'$ne': None}}
     projection = {'_id': 1, self.targetcol: 1}
     self.article.load(filter, projection)
     fr.report_fin()
     return self
Example #7
def load_results(targetmodel, targetcol, techname, apicode, method,
                 type_regex):
    """
    ************************************************************
     <class 'idebug.performance.Function'> | report_init
     caller : <ipython-input-6-2498b8afc3e9> | load_results
     visible_inputs : {'targetmodel': 'Article__네이버', 'targetcol': 'bodytext', 'techname': 'LangAnalysis', 'apicode': 'srl', 'docid': ObjectId('5c9bfe94639bfb5340fc6b67'), 'method': 'morp', 'type_regex': '^NN[GP]'}

    ************************************************************
     <class 'idebug.performance.Function'> | report_mid
     caller : <ipython-input-6-2498b8afc3e9> | load_results |  time taken by etri.load().
     start_dt : 2019-04-29 23:27:10.463422+02:00
     end_dt : 2019-04-29 23:32:16.106041+02:00
     interval_runtime : 5.1_[mins]

    ************************************************************
     <class 'idebug.performance.Function'> | report_mid
     caller : <ipython-input-6-2498b8afc3e9> | load_results |  time taken by json_normalize().
     start_dt : 2019-04-29 23:32:16.106041+02:00
     end_dt : 2019-04-29 23:38:01.344299+02:00
     interval_runtime : 5.8_[mins]

    ************************************************************
     <class 'idebug.performance.Function'> | report_mid
     caller : <ipython-input-6-2498b8afc3e9> | load_results |  time taken to build new_df.
     start_dt : 2019-04-29 23:38:01.344299+02:00
     end_dt : 2019-04-29 23:38:12.595281+02:00
     interval_runtime : 11.3_[secs]

    ************************************************************
     <class 'idebug.performance.Function'> | report_fin
     caller : <ipython-input-6-2498b8afc3e9> | load_results | None
     start_dt : 2019-04-29 23:27:10.463422+02:00
     end_dt : 2019-04-29 23:38:12.595390+02:00
     runtime : 11.0_[mins]
    """
    fr = dbg.Function(inspect.currentframe()).report_init()
    etri = models.ETRIAI(targetmodel, targetcol, techname, apicode)
    etri.load({}, {'_id': 0, 'docid': 1, 'results': 1})
    fr.report_mid(addi_info=" time taken by etri.load().")
    df = json_normalize(etri.docs, 'results', ['docid'])
    df = json_normalize(df.to_dict('records'), 'sentence', ['docid'])
    df = json_normalize(df.to_dict('records'), method, ['docid'])
    df = df[df.type.str.contains(pat=type_regex)]
    fr.report_mid(addi_info=" time taken by json_normalize().")

    nouns = []
    for n, g in df.groupby('docid'):
        nouns.append({'docid': n, 'noun_txt': " ".join(list(g.lemma))})
    df = pd.DataFrame(nouns)
    fr.report_mid(addi_info=" time taken to build new_df.")
    fr.report_fin()
    return df
Example #8
def collect_1_keyword(keyword='data analytics',
                      location='Spain',
                      duration=1,
                      pagination_sleep=3):
    fr = dbg.Function(inspect.currentframe()).report_init()
    c = Collector()
    c.move_to_job_search_page()
    c.set_searching(keyword, location).set_date(duration).set_sorting('date')
    c.extract_keyword_location()
    c.loop_pagination(sleepsecs=pagination_sleep)
    fr.report_fin()
Example #9
 def collect_on_keywords(self,
                         search_keywords,
                         location='Spain',
                         duration=0):
     fr = dbg.Function(inspect.currentframe()).report_init()
     ############################################################
     fi = FunctionIterator(search_keywords, self.collect_on_keyword)
     while fi.iterable:
         fi.nextop()
     ############################################################
     fr.report_fin()
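
The while fi.iterable / fi.nextop() loop implies an iterator that consumes one keyword per call and applies the wrapped function to it. A minimal hypothetical FunctionIterator consistent with that usage (not idebug's actual class):

# Hypothetical FunctionIterator matching the loop above; illustration only.
class FunctionIterator:
    def __init__(self, iterable, func):
        self.iterable = list(iterable)  # truthy while items remain
        self.func = func

    def nextop(self):
        item = self.iterable.pop(0)     # consume one item per call
        print(f" items left : {len(self.iterable)}")
        return self.func(item)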
Example #10
 def jnload(self, filter=None):
     fr = dbg.Function(inspect.currentframe()).report_init()
     filter = filter or {}  # avoid the mutable-default pitfall
     filter.update({'layout': {'$ne': None}})
     self.load(filter, {'html': 0})
     fr.report_fin()
     if len(self.docs) == 0:
         print("\n len(self.docs) is 0.\n")
     else:
         return json_normalize(
             self.docs, 'layout',
             ['_id', 'snapshot_dt']).rename(columns={'_id': 'snapshotID'})
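
json_normalize here explodes each snapshot's layout list into one row per entry while carrying _id and snapshot_dt along as metadata. A standalone illustration with made-up documents:

# Standalone illustration of the json_normalize call; the documents are made up.
from pandas import json_normalize

docs = [
    {'_id': 'a1', 'snapshot_dt': '2019-04-29', 'layout': [{'url': 'u1'}, {'url': 'u2'}]},
    {'_id': 'a2', 'snapshot_dt': '2019-04-30', 'layout': [{'url': 'u3'}]},
]
df = json_normalize(docs, 'layout',
                    ['_id', 'snapshot_dt']).rename(columns={'_id': 'snapshotID'})
print(df)  # one row per layout entry, with snapshotID/snapshot_dt repeated per row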
Example #11
def days_job():
    """ETRI언어분석_수집"""
    print(f"\n {inspect.stack()[0][3]} :\n{inspect.getdoc(days_job)}\n")
    fr = dbg.Function(inspect.currentframe()).report_init()
    articles.collect_etri_analysis(pressname='네이버',
                                   targetcol='headline',
                                   techname='LangAnalysis',
                                   apicode='srl')
    articles.collect_etri_analysis(pressname='네이버',
                                   targetcol='bodytext',
                                   techname='LangAnalysis',
                                   apicode='srl')
    fr.report_fin()
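
minutes_job, hours_job, and days_job (Examples #2, #3, and #11) read like entry points for a periodic scheduler, but the wiring is not shown on this page. One possibility, sketched with the third-party schedule library and made-up intervals:

# Hypothetical scheduler wiring for the *_job entry points; intervals are made up.
import time
import schedule  # pip install schedule

schedule.every(10).minutes.do(minutes_job)
schedule.every().hour.do(hours_job)
schedule.every().day.at("03:00").do(days_job)

while True:
    schedule.run_pending()
    time.sleep(1)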
Example #12
 def collect_on_keyword(self,
                        keyword='machine learning',
                        location='Spain',
                        duration=0):
     fr = dbg.Function(inspect.currentframe()).report_init()
     ############################################################
     self.move_to_job_search_page()
     self.put_searching_keyword(
         keyword,
         location).choose_posted_duration(duration).set_sorting('date')
     ############################################################
     self.extract_keyword_location()
     self.loop_pagination()
     ############################################################
     fr.report_fin()
Example #13
def collect(pressname=None, pagename=None):
    fr = dbg.Function(inspect.currentframe()).report_init()
    filter = {}
    if isinstance(pressname, str) and isinstance(pagename, str):
        filter.update({'pressname': pressname, 'name': pagename})
    page = models.NewsPage().load(filter)
    loop = dbg.Loop(
        f"{sys.modules[__name__].__file__} | {inspect.stack()[0][3]}",
        len(page.docs))
    for d in page.docs:
        page.attributize(d)
        c = Collector(page.pressname, page.name)
        c.collect()
        loop.report(
            addi_info=f" pressname : {page.pressname}, pagename : {page.name}")
    fr.report_fin()
Example #14
 def deduplicate(self):
     func = dbg.Function(inspect.currentframe()).report_init()
     df = self.load().get_df()
     func.report_mid(addi_info=" time taken by self.load().get_df().")
     if len(df) == 0:
         print("\n len(df) is 0.")
     else:
         TF = df.sort_values(['url',
                              'snapshot_dt']).duplicated(keep='first',
                                                         subset=['url'])
         df1 = df[TF]
         if len(df1) == 0:
             print("\n len(duplicated_df) is 0.")
         else:
             # list(df1) would pass column names; the duplicate _id values are needed.
             self.delete_many({'_id': {'$in': list(df1._id)}})
     func.report_fin()
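
The flagging logic sorts by url and snapshot time, then marks every row after the first occurrence of each url as a duplicate. A tiny pandas illustration on made-up rows:

# Illustration of the duplicate-flagging step; the rows are made up.
import pandas as pd

df = pd.DataFrame({
    '_id': [1, 2, 3],
    'url': ['u1', 'u1', 'u2'],
    'snapshot_dt': ['09:00', '10:00', '09:30'],
})
TF = df.sort_values(['url', 'snapshot_dt']).duplicated(keep='first', subset=['url'])
print(df[TF]._id.tolist())  # [2]: the later u1 snapshot is the duplicate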
Example #15
 def tokenize(self):
     fr = dbg.Function(inspect.currentframe()).report_init()
     method = 'morp'
     type_regex = '^NN[GP]'
     self.load(None, {'_id': 1, self.targetcol: 1})
     fr.report_mid(addi_info=" time taken by self.load(None, {'_id': 1, self.targetcol: 1}).")
     for d in self.docs:
         self.attributize(d)
         df = etri.load_result(self.submodel, self.targetcol, self.techname,
                               self.apicode, self._id, method, type_regex)
         if df is None:
             print("\n df is None.")
         else:
             d['tokens'] = list(df.lemma)
     fr.report_fin()
     return self
Example #16
 def collect_on_keyword(keyword='machine learning',
                        location='Spain',
                        duration=0,
                        pagination_sleep=3):
     fr = dbg.Function(inspect.currentframe()).report_init()
     nextpage_click_secs = pagination_sleep
     jobcard_click_secs = 3
     job_details_human_reading_secs = 20
     c = Collector(nextpage_click_secs, jobcard_click_secs,
                   job_details_human_reading_secs)
     c.move_to_job_search_page()
     c.put_searching_keyword(
         keyword,
         location).choose_posted_duration(duration).set_sorting('date')
     c.extract_keyword_location()
     c.loop_pagination()
     fr.report_fin()
Example #17
 def collect_on_keyword(self,
                        keyword='machine learning',
                        location='Spain',
                        duration=0):
     fr = dbg.Function(inspect.currentframe()).report_init()
     ############################################################
     self.collect_dt = datetime.now().astimezone()
     self.move_to_job_search_page()
     self.put_searching_keyword(
         keyword,
         location).choose_posted_duration(duration).set_sorting('date')
     self.extract_keyword_location()
     ############################################################
     if self.is_readyto_collect:
         self.iter_jobcards()
     ############################################################
     fr.report_fin()
Example #18
def collect_pages(keyword='data analytics',
                  location='Spain',
                  duration=3,
                  sleepsecs=2):
    fr = dbg.Function(inspect.currentframe()).report_init()
    sc = SearchConditionSetter()
    sc.set_searching(keyword, location).set_date(duration).set_sorting('date')
    """last page-num, current page-num 찾기."""
    pagination = driver.find_element_by_class_name(
        'search-results-pagination-section')
    pbuttons = pagination.find_elements_by_tag_name('li')
    print(f"\n pagination_count : {len(pbuttons)}\n")
    if len(pbuttons) == 0:
        collect_1page()
    else:
        last_pagenum = int(pbuttons[-1].find_element_by_tag_name('span').text)
        print(f" last_pagenum : {last_pagenum}")
        cur_pbutton = pagination.find_element_by_xpath(
            '//li[contains(@class, "active selected")]')
        cur_pagenum = int(cur_pbutton.find_element_by_tag_name('span').text)
        paginate(cur_pagenum, last_pagenum, sleepsecs)
    fr.report_fin()
Example #19
def collect_1page(sleepsecs=10):
    """url 하나당 25개의 Job-postings 를 수집/파싱한다.
    리스트를 반복하며 Job Posting 상세내용을 수집-분석.
    """
    fr = dbg.Function(inspect.currentframe()).report_init()
    uo = urlparse(driver.current_url)
    if ('keywords' in uo.query) and ('location' in uo.query):
        # jobcards = driver.find_elements_by_class_name('occludable-update')
        jobcards = driver.find_elements_by_class_name('artdeco-list__item')
        collect_dt = datetime.today().astimezone()
        loop = dbg.Loop(f"{inspect.stack()[0][3]} | jobcard-index progress",
                        len(jobcards))
        for jobcard in jobcards:
            time.sleep(sleepsecs)
            ############################################################
            jobcard_job_details(jobcard)
            ############################################################
            loop.report()
        fr.report_fin()
    else:
        print(
            f"\n keywords and location are missing from the url.\n driver.current_url : {driver.current_url}"
        )
        time.sleep(sleepsecs)
Example #20
 def classify(self, algorithm):
     fr = dbg.Function(inspect.currentframe()).report_init()
     traindf = self.traindf.copy()
     if algorithm == 'MultinomialNB':
         self.clf = Pipeline([
             ('vect', self.vectorizer),
             ('tfidf', self.tfidf_transformer),
             ('clf', MultinomialNB()),
         ])
         self.clf.fit(list(traindf.noun_txt), list(traindf.label))
     elif algorithm == 'SGDClassifier':
         self.clf = Pipeline([
             ('vect', self.vectorizer),
             ('tfidf', self.tfidf_transformer),
             ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                   random_state=42, max_iter=5, tol=None)),
         ])
         self.clf.fit(list(traindf.noun_txt), list(traindf.label))
     else:
         self.clf = None
     if self.clf is not None:
         self.predicted = self.clf.predict(list(self.testdf.noun_txt))
         self.testdf['predicted'] = self.predicted
     fr.report_fin()
     return self
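
The same MultinomialNB pipeline can be exercised end to end on toy data; everything below except the pipeline structure itself is made up for illustration:

# Toy end-to-end run of the MultinomialNB branch; the training data is made up.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
clf.fit(["market stocks economy", "goal match football"], ['econ', 'sport'])
print(clf.predict(["stocks market"]))  # ['econ']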
Example #21
def parse():
    fr = dbg.Function(inspect.currentframe()).report_init()
    press.naver.parse_articles()
    fr.report_fin()