def gain_data(self, query, begin=None, end=None, language=None, start=0,
              nums=0, pause=2):
    """First get the article count, then loop over result pages."""
    init_url = self.req_url(query, begin, end, language, start, pause=pause)
    # bsObj = self.Cold_boot(init_url)
    # TotalCount = self.counts_result(bsObj, start)
    pages = int(ceil(nums / 10))
    page = 0
    Allinformations = []
    while page <= pages:
        print(page)
        start = page * 10
        url = self.req_url(query, begin, end, language, start, pause=pause)
        ylog.debug(url)
        bsObj = self.Cold_boot(url)
        info = self.extract(bsObj)
        # print(type(bsObj))
        # info = self.content(bsObj)
        if len(info) == 0:
            break
        Allinformations = Allinformations + info
        page = page + 1
    infos = {
        # 'TotalCount': TotalCount,
        'QueryURL': init_url,
        'Allinformations': Allinformations
    }
    return infos
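# A minimal usage sketch for gain_data. The class name `GoogleSearch` and its
# constructor are hypothetical stand-ins for whatever class defines req_url,
# Cold_boot and extract above; gain_data pages through results 10 at a time,
# so nums=30 fetches about 3 pages.
scraper = GoogleSearch()  # hypothetical constructor
result = scraper.gain_data('deep learning', begin='2015', end='2018',
                           language='en', nums=30, pause=2)
print(result['QueryURL'])
print(len(result['Allinformations']), 'records fetched')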
def _get_http_response(self, url, log_msg=None, err_msg=None):
    """Helper method: send an HTTP request and return the response payload."""
    if log_msg is None:
        log_msg = 'HTTP response data follow'
    if err_msg is None:
        err_msg = 'request failed'
    try:
        ScholarUtils.log('info', 'requesting %s' % unquote(url))
        req = Request(url=url,
                      proxies=self.proxies,
                      headers={'User-Agent': ScholarConf.USER_AGENT})
        hdl = self.opener.open(req)
        html = hdl.read()
        ylog.debug(log_msg)
        ScholarUtils.log('debug', log_msg)
        ScholarUtils.log('debug', '>>>>' + '-' * 68)
        ScholarUtils.log('debug', 'url: %s' % hdl.geturl())
        ScholarUtils.log('debug', 'result: %s' % hdl.getcode())
        ScholarUtils.log('debug', 'headers:\n' + str(hdl.info()))
        ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))  # for Python 3
        ScholarUtils.log('debug', '<<<<' + '-' * 68)
        return html
    except Exception as err:
        ScholarUtils.log('info', err_msg + ': %s' % err)
        return None
def search_relation(self, bsObj, pause=2):
    # RelatedKw = []
    pq_content = self.pq_html(bsObj)
    # ylog.info(pq_content)
    related_str = str(pq_content)
    related_str_re = re.compile("\"rfs\":\[[^!]+\]")
    try:
        related_str_rfs = related_str_re.search(related_str).group()
    except AttributeError:
        LOGGER.debug(related_str)
        return None
    # ylog.debug(related_str_rfs)
    related_ls_re = re.compile("(:\[|,)(\"[A-Za-z\s\u4e00-\u9fa5]*\")")
    ls_related = related_ls_re.findall(related_str_rfs)
    RelatedKw = [x[1][1:-1] for x in ls_related]
    ylog.debug("related keywords: %s" % RelatedKw)
    # if pq_content is not None:
    #     for item in pq_content('p._Bmc').items():
    #         href = item('a').attr('href')
    #         if href:
    #             o = urlparse(href, 'http')
    #             if o.netloc:
    #                 kw = href
    #             if href.startswith('/search?'):
    #                 href = parse_qs(o.query)['q'][0]
    #                 o = urlparse(href, 'http')
    #                 if o.path:
    #                     kw = href
    #             RelatedKw.append(kw)
    return RelatedKw
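# Worked example (illustrative input): the result page embeds related searches
# in an "rfs":[...] fragment; the two patterns above first cut that fragment
# out, then pull each quoted keyword (Latin or CJK) from it.
import re
sample = '"rfs":["machine learning","深度学习","neural network"]'
rfs = re.compile(r'"rfs":\[[^!]+\]').search(sample).group()
pairs = re.compile(r'(:\[|,)("[A-Za-z\s\u4e00-\u9fa5]*")').findall(rfs)
print([kw[1][1:-1] for kw in pairs])
# -> ['machine learning', '深度学习', 'neural network']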
def batch_upload(re, file_path, BATCH_SIZE, func, start, end):
    """Batch upload categories or pages.

    Keyword Arguments:
    re         -- compiled regular expression
    file_path  -- source file path
    BATCH_SIZE -- number of matches per upload batch
    func       -- upload function
    start      -- start position
    end        -- end position
    """
    # with open(file_path, 'r') as f:
    #     print("reading all lines from sql")
    #     total_line_size = len(f.readlines())
    with open(file_path, 'rb') as f:
        for i, line in enumerate(tqdm(f)):
            line_start_position = 0
            line_end_position = len(line)
            # try to process the whole line in a while loop until it's done
            while True:
                if i < start:
                    break
                # elif i <= end:
                try:
                    test_string = line[line_start_position:].decode('utf-8')
                    line_size = len(re.findall(test_string))
                except UnicodeDecodeError as e:
                    line_end_position = e.start
                    ylog.debug('start at %s' % line_end_position)
                finally:
                    string = line[line_start_position:
                                  line_end_position].decode('utf-8')
                    line_size = len(re.findall(string))
                try:
                    last_span = re.search(string).span()[0]
                except AttributeError:
                    break
                line_size = len(re.findall(string))
                for _ in range(0, line_size, BATCH_SIZE):
                    # pause if a file named "pause" exists in the current dir
                    re_batch = {}
                    for j in range(BATCH_SIZE):
                        re_batch[j] = re.search(string, last_span)
                        if re_batch[j] is not None:
                            last_span = re_batch[j].span()[1]
                    func(re_batch)
                    line_end_position = len(line)
                    line_start_position = line_end_position + 10
                else:
                    break
def find_meta(title, doi):
    """Find metadata by title or DOI.

    Keyword Arguments:
    title -- article title
    doi   -- article DOI
    """
    ylog.info(title)
    works = Works()
    w1 = works.query(title).sort('relevance').order('desc')
    i = 0
    for item in w1:
        i = i + 1
        try:
            t = item.get('title')[0]
            sub_title = item.get('subtitle')[0]
        except:
            continue
        if SequenceMatcher(a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                a=title, b=sub_title).ratio() > 0.9:
            return item
        if i > 18:
            ylog.debug('[x]%s' % title)
            # ylog.debug(item['title'])
            return None
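# Hedged usage sketch: find_meta matches on title similarity via the crossref
# Works client used above; the title below is illustrative and the doi
# argument is unused by the lookup itself.
meta = find_meta('Heterogeneous resistance to vancomycin in Staphylococcus '
                 'epidermidis', doi=None)
if meta is not None:
    print(meta['DOI'], meta.get('title'))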
def upload_edge(ls_edges):
    """Upload edges batch by batch.

    Parameters:
    ls_edges -- list of edge tuples
    """
    len_edges = len(ls_edges)
    uploaded_number = 0
    batch_counter = 0
    for edge_counter in tqdm(range(0, len_edges, BATCH_SIZE)):
        res = None
        error = None
        retry = 0
        graph_upload_request = graphUpload_pb2.GraphUploadRequest()
        while res is None:
            try:
                graph_upload_request = graphUpload_pb2.GraphUploadRequest()
                for e in ls_edges[batch_counter:batch_counter + BATCH_SIZE]:
                    node_from = e[0]
                    node_to = e[1]
                    edge_type = e[2]
                    # page edge
                    if edge_type == 0:
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            node_to)
                    # categories edge
                    else:
                        if node_from in IGNORE_CATEGORIES:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_to)
                        edge.props.type = "HasSubset"
                graph_upload_request.uploadTag = "uploadWikiEdge"
                graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                res = gs_call.upload_graph(graph_upload_request)
            except HTTPError as e:
                if e.code in RETRIABLE_STATUS_CODES:
                    error = 'A retriable HTTP error %d occurred:\n%s' % (
                        e.code, e.reason)
                else:
                    raise
            except RETRIABLE_EXCEPTIONS as e:
                error = 'A retriable error occurred: %s' % e
            except GRAPH_EXCEPTIONS as e:
                ylog.debug('A graph error occurred: %s' % e)
                break
            if error is not None:
                print(error)
                retry += 1
                res = None
                if retry > MAX_RETRIES:
                    ylog.debug(res)
                    exit("no longer attempting to retry.")
                max_sleep = 2**retry
                sleep_seconds = random.random() * max_sleep
                print('Sleeping %f seconds and then retrying...' %
                      sleep_seconds)
                time.sleep(sleep_seconds)
        try:
            if res.edgeUpdateResultStatistics:
                ylog.debug(res.edgeUpdateResultStatistics)
                number = res.edgeUpdateResultStatistics.numOfCreations + \
                    res.edgeUpdateResultStatistics.numOfUpdates + \
                    res.edgeUpdateResultStatistics.numOfSkips
                uploaded_number += number
            if res.failedEdges:
                for err in res.failedEdges:
                    ylog.debug(err)
                    ylog.debug("start node: %s" %
                               err.edge.startNodeID.primaryKeyInDomain)
                    ylog.debug("end node: %s" %
                               err.edge.endNodeID.primaryKeyInDomain)
        except:
            pass
        batch_counter += BATCH_SIZE
    return uploaded_number
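# Minimal sketch of the input upload_edge expects: (from, to, type) tuples,
# where type 0 marks a category->page edge and anything else a
# category->subcategory edge. Titles are illustrative; gs_call, BATCH_SIZE and
# the retry constants must already be configured as in the surrounding script.
sample_edges = [
    ('数学', '线性代数', 0),  # category -> page
    ('数学', '代数', 1),      # category -> subcategory
]
print('uploaded:', upload_edge(sample_edges))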
# ylog.debug("test") batch_size = 2 # test fetch graph test_url = 'http://192.168.1.166:9080' prod_url = 'http://q.gftchina.com:13567/vqservice/vq/' test_user_name = 'wuwei' test_pwd = 'gft' gs_call = gftIO.GSCall(test_url, test_user_name, test_pwd) try: graph = gftIO.get_graph_from_neo4j('392482970E904D11190D208B7C22874A', server_url=test_url, user_name=test_user_name, pwd=test_pwd) except: pass # read sql file ylog.debug('reading sql files') # category category_path = "/home/weiwu/share/deep_learning/data/zhwiki_cat_pg_lk/zhwiki-latest-category.zhs.sql" category_sql = open(category_path, 'r') category = category_sql.read() category_sql.close() wiki_category_re = re.compile( "\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)") wiki_category_size = len(wiki_category_re.findall(category)) # TODO: add counter of successful uploaded edges. ylog.debug('start uploading edges') last_span = wiki_category_re.search(category).span()[0]
ylog.console_on()
ylog.filelog_on("app")
works = Works()
title = """Heterogeneous resistance to vancomycin in Staphylococcus epidermidis, Staphylococcus haemolyticus and Staphylococcus warneri clinical strains: characterisation"""
w1 = works.query(title).sort('relevance').order('desc')
i = 0
target_doi = '10.1109/icdcs.2006.48'
items_result = None
for item in w1:
    i = i + 1
    try:
        t = item.get('title')[0]
        sub_title = item.get('subtitle')[0]
        ylog.debug('crossref item title ')
        ylog.debug(t)
        ylog.debug(sub_title)
    except:
        ylog.debug(item)
        continue
    if SequenceMatcher(a=title, b=t).ratio() > 0.8:
        found_doi = item['DOI']
        ylog.debug("target doi: %s" % target_doi)
        ylog.debug("found doi: %s" % found_doi)
        if target_doi[:10] == found_doi[:10] or SequenceMatcher(
                a=target_doi, b=found_doi).ratio() > 0.9:
            print('found')
            break
    if i > 0:
        ylog.debug('[x]%s' % title)
def risk_model(df_ret, dict_risk_expo, capital, corr_half_life,
               var_half_life):
    """Regress stock returns on prior factor exposures to get factor returns,
    their covariance, and the regression residuals.

    Pseudo code:
    1. process input data: parse, drop and fill.
    2. get the intersection of all factor names, all symbol names, all dates.
    3. solve the heteroskedasticity problem by weighting with the square root
       of market capitalization (handbook p5, p15):
       new return = sqrt(market cap) * stock return, with a constraint column
       appended to the new return.
       calculate factor returns.
       calculate the factor return covariance.
       calculate the residual (specific) variances of the regression.
       generate the final return value.

    Keyword Arguments:
    df_ret         -- pd.DataFrame, stock daily return.
    dict_risk_expo -- dictionary, factor exposure, key=factor.
    capital        -- pd.DataFrame, stock market capital, to calculate weight.
    corr_half_life -- int, correlation half life.
    var_half_life  -- int, variance half life.

    Return: 27 industrial factors + 8 style factors
    return       -- pd.DataFrame
    ret_cov      -- pd.DataFrame, return covariance
    specificRisk -- pd.DataFrame, residual
    """
    # get all factor names
    ylog.debug('parse data')
    ls_fexponame = list(
        map(gftIO.gidInt2Str,
            list(dict_risk_expo['osets'].asColumnTab()['O0'])))
    ind_factor_name = sorted(
        list(
            map(gftIO.gidInt2Str,
                list(dict_risk_expo[ls_fexponame[0]].asColumnTab()['O0']))))
    sty_factor_name = sorted(
        list(
            map(gftIO.gidInt2Str,
                list(dict_risk_expo[ls_fexponame[1]].asColumnTab()['O0']))))
    allfactor = ind_factor_name + sty_factor_name

    # stock return preprocess
    df_w_ret = df_ret.asMatrix().T.dropna(how='all', axis=1)

    # get the factor exposure date list (all snapshots)
    dict_risk_expo_new = {
        factorname: dict_risk_expo[factorname].asMatrix().dropna(how='all')
        for factorname in allfactor
    }
    ls_ls_fexpodate = list([
        dict_risk_expo_new[factorname].index.tolist()
        for factorname in dict_risk_expo_new.keys()
    ])
    ls_alldates_fexpo = reduce(np.intersect1d, ls_ls_fexpodate)

    # get the factor exposure symbol list
    ls_ls_fexposymbol = list([
        dict_risk_expo_new[factorname].columns.tolist()
        for factorname in dict_risk_expo_new.keys()
    ])
    ls_allsymbols_fexpo = reduce(np.intersect1d, ls_ls_fexposymbol)

    # weight preprocess
    weight = capital.asMatrix().T

    # get the date/symbol intersection of (stock return, factor exposure,
    # capital); ls_alldates saves the stock return map dates.
    # get the fexpo date and find the nearest business day
    fexpodate = pd.DataFrame(ls_alldates_fexpo, columns=['date_fexpo'])
    retdate = pd.DataFrame(df_w_ret.columns, columns=['date_ret'])
    retdate.sort_values("date_ret", ascending=True, inplace=True)
    fexpodate.sort_values("date_fexpo", ascending=True, inplace=True)
    df_date_map = pd.merge_asof(retdate,
                                fexpodate,
                                left_on="date_ret",
                                right_on="date_fexpo",
                                allow_exact_matches=False)
    df_date_map.dropna(how='any', inplace=True)
    df_date_map = df_date_map.drop_duplicates(
        subset='date_fexpo').reset_index()
    dict_date_map = {
        df_date_map.date_fexpo[i]: df_date_map.date_ret[i]
        for i in range(len(df_date_map))
    }
    ls_alldates = sorted(
        list(
            set(capital.columns).intersection(set(
                df_w_ret.columns)).intersection(set(dict_date_map.values()))))
    ls_alldates_ondaybefore = sorted(list(dict_date_map.keys()))

    # get the daily symbol list
    ls_allsymbols = {
        date: list(
            set(df_w_ret[[dict_date_map[date]]].dropna().index).intersection(
                set(ls_allsymbols_fexpo)).intersection(set(capital.index)))
        for date in ls_alldates_ondaybefore
    }

    # align the stock return and factor exposure
    dict_df_capital_raw = {
        date: capital[[date]].reindex(index=ls_allsymbols[date]).fillna(0)
        for date in ls_alldates_ondaybefore
    }
    dict_df_capital = {
        date: np.sqrt(dict_df_capital_raw[date])
        for date in ls_alldates_ondaybefore
    }
    dict_df_ret = {
        dict_date_map[date]: pd.concat(
            [(df_w_ret[[dict_date_map[date]]].reindex(
                index=ls_allsymbols[date])) *
             (dict_df_capital[date].rename(
                 columns={date: dict_date_map[date]})),
             pd.DataFrame(data=np.zeros(1),
                          index=['constrain'],
                          columns=[dict_date_map[date]])],
            axis=0)
        for date in ls_alldates_ondaybefore
    }
    dict_df_fexpo_raw = {
        date: fexpomerge(dict_risk_expo_new, date, allfactor, ls_allsymbols)
        for date in ls_alldates_ondaybefore
    }
    dict_df_fexpo = {
        date: dict_df_fexpo_raw[date].assign(countryfactor=1).multiply(
            dict_df_capital[date].squeeze(), axis='index')
        for date in ls_alldates_ondaybefore
    }

    # calculate constraints
    dict_df_fexpo_con = {
        date: expoconstrain(dict_df_fexpo_raw, date, ind_factor_name,
                            allfactor, dict_df_capital_raw, sty_factor_name,
                            dict_df_fexpo)
        for date in ls_alldates_ondaybefore
    }
    # for i in dict_risk_expo_new.keys():
    #     if dict_risk_expo_new[i].index.min() > df_l_ret.index.min(
    #     ) or dict_risk_expo_new[i].index.max() < df_l_ret.index.max():
    #         raise Exception

    # step 3: calculate factor returns
    ls_df_fitresult = {
        dict_date_map[date]: Regression(date, dict_df_ret, dict_df_fexpo_con,
                                        dict_df_capital, dict_df_fexpo,
                                        dict_date_map)
        for date in ls_alldates_ondaybefore
    }
    ls_df_facreturn = list(
        ls_df_fitresult[date]['params'].rename(columns={'params': date})
        for date in ls_alldates)
    df_model_params = reduce(
        lambda df_para1, df_para2: pd.concat([df_para1, df_para2], axis=1),
        ls_df_facreturn)

    # step 4: calculate the factor return covariance
    df_allfactorret = df_model_params.T
    df_allfactorret = df_allfactorret.sort_index()
    corrhalflife = int(corr_half_life)
    varhalflife = int(var_half_life)
    halflife = max(corrhalflife, varhalflife)
    if len(ls_alldates) < halflife:
        raise Exception("More data needed")
    else:
        ls_alldatesnew = ls_alldates[halflife - 1:len(ls_alldates)]
        corrwgts = list(
            map(lambda x: mt.sqrt(0.5**(x / int(corrhalflife))),
                list(range(int(corrhalflife) - 1, -1, -1))))
        varwgts = list(
            map(lambda x: mt.sqrt(0.5**(x / int(varhalflife))),
                list(range(int(varhalflife) - 1, -1, -1))))
        ls_factorretcov = list(
            calcfactorRetCov(df_allfactorret, date, corrwgts, varwgts,
                             corrhalflife, varhalflife)
            for date in ls_alldatesnew)
        df_l_factorretcov = pd.concat(
            ls_factorretcov, axis=0).rename(columns={'variable': 'factorid2'})

        # step 5: calculate the residual (specific) variances of the regression
        # part 1: merge factor return, factor exposure and stock return
        ls_specificrisk = list(
            ls_df_fitresult[date]['resid'].rename(columns={'resid': date})
            for date in ls_alldates)
        df_w_specificrisk = pd.concat(ls_specificrisk, axis=1).T
        df_w_specificrisk = df_w_specificrisk.sort_index()
        specificwgts = list(
            map(lambda x: mt.sqrt(0.5**(x / int(halflife))),
                list(range(int(halflife) - 1, -1, -1))))
        ls_factorretspe = list(
            calcfactorRetSpe(df_w_specificrisk, date, specificwgts, halflife)
            for date in ls_alldatesnew)
        df_specificrisk_var = pd.concat(ls_factorretspe, axis=0)

        # step 6: generate the final return value
        df_allfactorret = df_allfactorret.drop('countryfactor', axis=1)
        dict_factorret = {
            key + '.ret': df_allfactorret[[key]].rename(
                columns={
                    key: list(
                        gftIO.strSet2Np(
                            np.array(list(
                                df_allfactorret[[key]].columns))))[0]
                })
            for key in df_allfactorret.columns
        }
        dictMerged = dict(
            dict_factorret, **{
                'ret_cov': df_l_factorretcov,
                'specificRisk': df_specificrisk_var
            })

        return dictMerged
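# Worked example of the exponential half-life weights built in step 4 above:
# an observation h periods old gets weight sqrt(0.5 ** (h / half_life)), so
# the squared weights halve every `half_life` periods and the newest
# observation gets weight 1.
import math as mt
half_life = 4
wgts = [mt.sqrt(0.5**(x / half_life)) for x in range(half_life - 1, -1, -1)]
print(wgts)  # oldest -> newest: approx [0.771, 0.841, 0.917, 1.0]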
import datetime
import logging
import math as mt
import os
import re
import warnings
from functools import reduce

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.regression.linear_model as lm

from lib.gftTools import gftIO
from ylib import ylog

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("app")

risk_model_path = '/home/weiwu/share/risk_model/'
x0 = gftIO.zload(os.path.join(risk_model_path, 'stock_return.pkl'))
x1 = gftIO.zload(os.path.join(risk_model_path, 'factors.pkl'))
x2 = gftIO.zload(os.path.join(risk_model_path, 'market_capital.pkl'))
x3 = 4
x4 = 5
ylog.debug('parse data')
def find_meta(self, identifier):
    """Find metadata with title or DOI.

    Keyword Arguments:
    identifier -- dict with at least 'article_link' and 'name'
    """
    try:
        # verify=False is dangerous but sci-hub.io
        # requires intermediate certificates to verify
        # and requests doesn't know how to download them.
        # as a hacky fix, you can add them to your store
        # and verifying would work. will fix this later.
        url = self.base_url + identifier['article_link']
        self.sess.headers = {'user-agent': self.get_random_user_agent()}
        res = self.sess.get(url, verify=False, allow_redirects=False)
        re_bracket = re.compile("\[(.*?)\]\s")
        title = re.sub(re_bracket, "", identifier['name'])
        ylog.debug('*' * 80)
        ylog.debug("title: %s" % title)
        ylog.debug(res.status_code)
        # self.out.ix[title]['status_code'] = res.status_code
        ylog.debug("headers: %s" % res.headers['Content-Type'])
        ylog.debug('location: %s' % res.headers.get("Location"))
        # self.out.ix[title]['location'] = res.headers.get("Location")
        search_title = True
        if not res.headers.get("Location"):
            content = res.content
            if len(content) > 2:
                import cchardet
                charset = cchardet.detect(content)
                text = content.decode(charset['encoding'])
                soup = BeautifulSoup(text, "lxml")
                script = soup.script.get_text()
                doi_regexp = '10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
                try:
                    doi_match = re.compile(doi_regexp).findall(script)[0]
                    ylog.info("DOI: %s" % doi_match)
                    search_title = False
                    # use the crossref API to get metadata
                    works = Works()
                    w1 = works.query(doi_match).sort('relevance').order(
                        'desc')
                    i = 0
                    for item in w1:
                        # TODO: verify title
                        # self.out.ix[title]['DOI'] = item['DOI']
                        return {'meta': item['DOI'], 'url': url}
                except IndexError:
                    ylog.debug('failed to find regexp')
        elif search_title:
            works = Works()
            w1 = works.query(title).sort('relevance').order('desc')
            i = 0
            for item in w1:
                i = i + 1
                try:
                    # ylog.debug('crossref item title ')
                    t = item.get('title')[0]
                    # ylog.debug(t)
                    sub_title = item.get('subtitle')[0]
                    # ylog.debug(sub_title)
                    # ylog.debug("ratio: %s" %
                    #            (SequenceMatcher(a=title, b=t).ratio()))
                except TypeError:
                    sub_title = ''
                if SequenceMatcher(a=title, b=t).ratio() > 0.9 or \
                        SequenceMatcher(a=title, b=sub_title).ratio() > 0.9 \
                        or t.startswith(title):
                    ylog.debug("DOI %s" % item['DOI'])
                    # self.out.ix[title]['DOI'] = item['DOI']
                    return {'meta': item['DOI'], 'url': url}
                if i > 18:
                    # ylog.debug('[x]%s' % title)
                    # ylog.debug(item['title'])
                    return None
    except requests.exceptions.ConnectionError:
        logger.info('{} cannot access, changing'.format(
            self.available_base_url_list[0]))
        self._change_base_url()
    except requests.exceptions.RequestException as e:
        return {
            'err':
            'Failed to fetch pdf with identifier %s (resolved url %s) due to request exception.'
            % (identifier, url)
        }
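# Worked example of the DOI regexp used above, applied to an illustrative
# page-script snippet: it matches "10.<registrant>/<suffix>" and stops before
# quote and angle-bracket characters.
import re
doi_regexp = '10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
snippet = 'var article = {doi: "10.1109/icdcs.2006.48"};'
print(re.compile(doi_regexp).findall(snippet))
# -> ['10.1109/icdcs.2006.48']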
from tempfile import gettempdir

tmp_dir = gettempdir()
output = open(tmp_dir + '/test.txt', 'w')
logging.info("extract pages")
for page_id in ls_pageid:
    logging.info(page_id)
    try:
        text = gs_call.get_nodes_binary_data([page_id])
    except DecodeError:
        continue
    page = text.entries[0].data.data.decode('utf-8')
    text = preprocess_string(page)
    # ylog.debug(text)
    output.write(text + '\n')
output.close()

if __name__ == '__main__':
    # ylog.set_level(logging.DEBUG)
    # ylog.console_on()
    # ylog.filelog_on("wiki_upload")
    gs_call = gftIO.GSCall(prod_url, test_user_name, test_pwd)
    cat_path = user_path + '/share/deep_learning/data/GID/cat.txt'
    page_path = user_path + "/share/deep_learning/data/GID/page.txt"
    page_gid_file = open(page_path)
    lines = page_gid_file.read().splitlines()
    page_gid = [s.strip() for s in lines]
    for gid in page_gid:
        ylog.debug(gid)
        extract_pages(gid, gs_call)
def upload_node(dict_re_match_object):
    """Upload the regular-expression match objects in the dictionary in a batch.

    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill node properties; use the encoded original Chinese title plus url
       as the url property.
    4. if there's any error in the upload response, retry.
    5. print upload statistics.

    Keyword Arguments:
    dict_re_match_object -- dict of re match objects
    """
    res = None
    error = None
    re_upload_error = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate over the nodes batch
            for index, value in dict_re_match_object.items():
                if value is not None:
                    item = dict_re_match_object.get(index)
                    # print(item)
                    title = item.group()[1:-1]
                    zh_title = HanziConv.toSimplified(title)
                    # if zh_title in IGNORE_CATEGORIES:
                    #     break
                    node = graph_upload_request.graph.nodes.add()
                    node.props.type = "readonlyDoc"
                    # p1 = node.props.props.entries.add()
                    # p1.key = "url"
                    # p1.value = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(
                    #     title)
                    p2 = node.props.props.entries.add()
                    p2.key = "_s_import_source"
                    p2.value = "word2vec model"
                    node.businessID.url = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(
                        title)
                    node.names.chinese = zh_title
            # other information of the upload request
            graph_upload_request.uploadTag = "UploadWord2VecVocabNodes"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        try:
            if res.failedNodes:
                re_upload_error = "some nodes failed to upload %s" % res.failedNodes
        except:
            pass
        if re_upload_error is not None:
            print(re_upload_error)
            nodes_fail_retry += 1
            res = None
            if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
                ylog.debug(res)
                res = "continue"
        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                # break
                # exit("no longer attempting to retry.")
            ylog.debug(res)
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    # ylog.debug(res)
    # at this point we have jumped out of the `while res is None` loop
    try:
        if res.nodeUpdateResultStatistics:
            ylog.debug(res.nodeUpdateResultStatistics)
            uploaded_number = res.nodeUpdateResultStatistics.numOfCreations + \
                res.nodeUpdateResultStatistics.numOfUpdates + \
                res.nodeUpdateResultStatistics.numOfSkips
        if res.uploadedNodes:
            for updated in res.uploadedNodes:
                ylog.debug("uploaded node GID: %s" % updated.gid)
        if res.failedNodes:
            for err in res.failedNodes:
                if err.error.errorCode != 202001:
                    ylog.info(err.error)
                    ylog.debug(err.error)
    except:
        pass
    return uploaded_number
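# Minimal sketch of the input upload_node expects: a dict of re.Match objects
# whose group() is a quoted title, as batch_upload would produce. Built by
# hand here from an illustrative SQL fragment; gs_call and the retry constants
# must already be configured as in the surrounding script.
import re
title_re = re.compile(r"'[^']+'")
sample = "(1,'数学',0,0,0),(2,'物理学',0,0,0)"
re_batch = {i: m for i, m in enumerate(title_re.finditer(sample))}
print('uploaded:', upload_node(re_batch))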
def delete_edge(dict_re_match_object):
    """Delete the edges matched by the regular expression in the dictionary,
    in a batch.

    1. get each value from the input dictionary.
    2. derive the edge type, start node and end node.
    3. compute the edge ID hash and call the delete API.
    4. if a retriable error occurs, retry.
    5. print delete statistics.

    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')

    Keyword Arguments:
    dict_re_match_object -- dict of re match objects
    """
    uploaded_number = 0
    for index, value in dict_re_match_object.items():
        if value is not None:
            item = dict_re_match_object.get(index)
            edge_type = item.group(7)[1:-1]
            del_edge_type = None
            if edge_type == 'page':
                page_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in page_title:
                    end = page_title.split("\\n")
                    page_title = end[-1]
                page_title = page_title.replace(" ", "_")
                startNodeID_domain = "https://zh.wikipedia.org/wiki/Category:"
                startNodeID_primaryKeyInDomain = cat_title
                endNodeID_domain = "https://zh.wikipedia.org/wiki/"
                endNodeID_primaryKeyInDomain = page_title
                del_edge_type = "HasElement"
            if edge_type == 'subcat':
                subcat_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in subcat_title:
                    end = subcat_title.split("\\n")
                    subcat_title = end[-1]
                subcat_title = subcat_title.replace(" ", "_")
                subcat_title_zh = HanziConv.toSimplified(subcat_title)
                cat_title_zh = HanziConv.toSimplified(cat_title)
                startNodeID_domain = "https://zh.wikipedia.org/wiki/Category:"
                startNodeID_primaryKeyInDomain = cat_title
                endNodeID_domain = "https://zh.wikipedia.org/wiki/Category:"
                endNodeID_primaryKeyInDomain = subcat_title
                del_edge_type = "HasSubset"
            if del_edge_type is not None:
                start_node_pk = startNodeID_domain + "/" + startNodeID_primaryKeyInDomain
                end_node_pk = endNodeID_domain + "/" + endNodeID_primaryKeyInDomain
                start_node_hash = hashlib.md5(
                    start_node_pk.encode('utf-8')).hexdigest().upper()
                end_node_hash = hashlib.md5(
                    end_node_pk.encode('utf-8')).hexdigest().upper()
                get_or_else = ""
                get_source = ""
                get_target = ""
                edge_str = "|".join([
                    start_node_hash, end_node_hash, del_edge_type,
                    get_or_else, get_source, get_target
                ])
                edge_md5 = hashlib.md5(
                    edge_str.encode('utf-8')).hexdigest().upper()
                del_edge_type = None
                res = None
                error = None
                retry = 0
                while res is None:
                    try:
                        res = gs_call.delete_edge(edge_md5, False)
                    except GSError as e:
                        # error = 'edge not existed'
                        res = 'failed'
                        ylog.debug('failed %s from %s to %s' %
                                   (edge_md5, start_node_hash, end_node_hash))
                    except HTTPError as e:
                        if e.code in RETRIABLE_STATUS_CODES:
                            error = 'A retriable HTTP error %d occurred:\n%s' % (
                                e.code, e.reason)
                        else:
                            raise
                    else:
                        res = 'success'
                        ylog.debug('deleted %s from %s to %s' %
                                   (edge_md5, start_node_hash, end_node_hash))
                    if error is not None:
                        print(error)
                        retry += 1
                        # res = None
                        if retry > MAX_RETRIES:
                            ylog.debug(res)
                            exit("no longer attempting to retry.")
                        max_sleep = 2**retry
                        sleep_seconds = random.random() * max_sleep
                        print('Sleeping %f seconds and then retrying...' %
                              sleep_seconds)
                        time.sleep(sleep_seconds)
                if res == 'success':
                    uploaded_number += 1
                    ylog.debug('deleted %s from %s to %s' %
                               (edge_md5, start_node_hash, end_node_hash))
    return uploaded_number
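# Worked example of the edge-ID scheme delete_edge relies on (as read from the
# code above, not from any published API doc): each endpoint is identified by
# md5(domain + "/" + primaryKey), and the edge by the md5 of the pipe-joined
# "start|end|type|||" string. Titles are illustrative.
import hashlib

def edge_md5(start_domain, start_pk, end_domain, end_pk, edge_type):
    start_hash = hashlib.md5(
        (start_domain + "/" + start_pk).encode('utf-8')).hexdigest().upper()
    end_hash = hashlib.md5(
        (end_domain + "/" + end_pk).encode('utf-8')).hexdigest().upper()
    edge_str = "|".join([start_hash, end_hash, edge_type, "", "", ""])
    return hashlib.md5(edge_str.encode('utf-8')).hexdigest().upper()

print(edge_md5("https://zh.wikipedia.org/wiki/Category:", "数学",
               "https://zh.wikipedia.org/wiki/", "线性代数", "HasElement"))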
def upload_edge(dict_re_match_object):
    """Upload the edge regular-expression match objects in the dictionary in
    a batch.

    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill edge properties; set edge start node and end node.
    4. if there's any error in the upload response, retry.
    5. print upload statistics.

    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')
    (id, from, to, ...)

    Keyword Arguments:
    dict_re_match_object -- dict of re match objects
    """
    res = None
    error = None
    re_upload_error = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate over the edges batch
            for index, value in dict_re_match_object.items():
                if value is not None:
                    item = dict_re_match_object.get(index)
                    edge_type = item.group(7)[1:-1]
                    if edge_type == 'page':
                        page_title = item.group(3)[1:-1]
                        cat_title = item.group(2)[1:-1]
                        if '\\n' in cat_title:
                            end = cat_title.split("\\n")
                            cat_title = end[-1]
                        if '\\n' in page_title:
                            end = page_title.split("\\n")
                            page_title = end[-1]
                        page_title = page_title.replace(" ", "_")
                        page_title_zh = HanziConv.toSimplified(page_title)
                        cat_title_zh = HanziConv.toSimplified(cat_title)
                        # if not cat_title_zh in EXAMPLE_CATEGORIES_PAGE_DICT:
                        #     continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            cat_title)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            page_title)
                    if edge_type == 'subcat':
                        subcat_title = item.group(3)[1:-1]
                        cat_title = item.group(2)[1:-1]
                        if '\\n' in cat_title:
                            end = cat_title.split("\\n")
                            cat_title = end[-1]
                        if '\\n' in subcat_title:
                            end = subcat_title.split("\\n")
                            subcat_title = end[-1]
                        subcat_title = subcat_title.replace(" ", "_")
                        subcat_title_zh = HanziConv.toSimplified(subcat_title)
                        cat_title_zh = HanziConv.toSimplified(cat_title)
                        # if not cat_title_zh in EXAMPLE_CATEGORIES_PAGE_DICT:
                        #     continue
                        if subcat_title_zh == cat_title_zh:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            cat_title)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            subcat_title)
                        edge.props.type = "HasSubset"
            graph_upload_request.uploadTag = "uploadWikiEdge"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
            # ylog.debug(res)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        except GRAPH_EXCEPTIONS as e:
            break
        # try:
        #     if res.failedEdges:
        #         re_upload_error = "some nodes failed to upload %s" % res.failedEdges
        # except:
        #     pass
        # if re_upload_error is not None:
        #     print(re_upload_error)
        #     nodes_fail_retry += 1
        #     res = None
        #     if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
        #         ylog.debug(res)
        #         res = "continue"
        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                ylog.debug("no longer attempting to retry.")
                error = None
                # exit("no longer attempting to retry.")
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    try:
        if res.edgeUpdateResultStatistics:
            ylog.debug(res.edgeUpdateResultStatistics)
            uploaded_number = res.edgeUpdateResultStatistics.numOfCreations + \
                res.edgeUpdateResultStatistics.numOfUpdates + \
                res.edgeUpdateResultStatistics.numOfSkips
        if res.failedEdges:
            for err in res.failedEdges:
                ylog.debug(err)
                ylog.debug("start node: %s" %
                           err.edge.startNodeID.primaryKeyInDomain)
                ylog.debug("end node: %s" %
                           err.edge.endNodeID.primaryKeyInDomain)
    except:
        pass
    return uploaded_number
def dump_edges(graph):
    # NOTE: the original `def` line is missing from this snippet; the name and
    # signature are reconstructed from the body, which reads `graph` and
    # returns the edge list.
    ls_edges = []
    for e in tqdm(graph.edges):
        node_from = e[0]
        node_to = e[1]
        edge_type = graph[node_from][node_to]['subtype']
        ls_edges.append(tuple([node_from, node_to, edge_type]))
    import pickle
    with open('graph_whole.pkl', 'wb') as fp:
        pickle.dump(ls_edges, fp)
    return ls_edges


wiki_category_re = re.compile(
    "\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)")
ylog.debug('create graph nodes')
batch_upload(wiki_category_re,
             category_path,
             200,
             add_node,
             start=0,
             end=10000000)
ylog.debug('create graph edges')
batch_upload(wiki_category_link_re,
             category_link_path,
             200,
             add_edge,
             start=0,
             end=10000000)
d = defaultdict(list)
        query.set_include_patents(False)
    if options.no_citations:
        query.set_include_citations(False)

    if options.count is not None:
        options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        csv(querier)
    elif options.csv_header:
        csv(querier, header=True)
    elif options.citation is not None:
        citation_export(querier)
    else:
        txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0


if __name__ == "__main__":
    ylog.debug('start')
    sys.exit(main())
    # main([
    #     '-c', '1', '--author', "albert einstein", '--phrase', 'quantum theory'
    # ])
# test fetch graph
test_url = 'http://192.168.1.166:9080'
prod_url = 'http://q.gftchina.com:13567/vqservice/vq/'
test_user_name = 'wuwei'
test_pwd = 'gft'
gs_call = gftIO.GSCall(test_url, test_user_name, test_pwd)
try:
    graph = gftIO.get_graph_from_neo4j('392482970E904D11190D208B7C22874A',
                                       server_url=test_url,
                                       user_name=test_user_name,
                                       pwd=test_pwd)
except:
    pass

# read sql file
ylog.debug('reading sql files')
# category
category_path = "/home/weiwu/share/deep_learning/data/zhwiki_cat_pg_lk/zhwiki-latest-category.zhs.sql"
category_sql = open(category_path, 'r')
category = category_sql.read()
category_sql.close()
wiki_category_re = re.compile(
    "\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)")
wiki_category = wiki_category_re.findall(category)
# page
page_path = "/home/weiwu/share/deep_learning/data/zhwiki_cat_pg_lk/zhwiki-latest-page.zhs.sql"
page_sql = open(page_path, 'r')
page = page_sql.read()
page_sql.close()
wiki_page_re = re.compile(
def main():
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                     help='Author name(s)')
    group.add_option('-A', '--all', metavar='WORDS', default=None,
                     dest='allw',
                     help='Results must contain all of these words')
    group.add_option('-s', '--some', metavar='WORDS', default=None,
                     help='Results must contain at least one of these words. '
                     'Pass arguments in form -s "foo bar baz" for simple '
                     'words, and -s "a phrase, another phrase" for phrases')
    group.add_option('-n', '--none', metavar='WORDS', default=None,
                     help='Results must contain none of these words. '
                     'See -s|--some re. formatting')
    group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t', '--title-only', action='store_true', default=False,
                     help='Search title only')
    group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after', metavar='YEAR', default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before', metavar='YEAR', default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents', action='store_true', default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations', action='store_true', default=False,
                     help='Do not include citations in results')
    group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                     help='Do not search, just use articles in given cluster ID')
    group.add_option('-c', '--count', type='int', default=None,
                     help='Maximum number of results')
    parser.add_option_group(group)

    group = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    group.add_option('--txt', action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals', action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv', action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header', action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option('--citation', metavar='FORMAT', default=None,
                     help='Print article details in standard citation format. '
                     'Argument must be one of "bt" (BibTeX), "en" (EndNote), '
                     '"rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option('--cookie-file', metavar='FILE', default=None,
                     help='File to use for cookie storage. If given, will '
                     'read any existing cookies if found at startup, and save '
                     'resulting cookies in the end.')
    group.add_option('-d', '--debug', action='count', default=0,
                     help='Enable verbose logging to stderr. Repeated options '
                     'increase detail of debug output.')
    group.add_option('-v', '--version', action='store_true', default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()
    ylog.debug(options)

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug'])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is scholar.py %s.' % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print('Cluster ID queries do not allow additional search '
                  'arguments.')
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print('Invalid citation link format, must be one of '
              '"bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.count is not None:
        options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        csv(querier)
    elif options.csv_header:
        csv(querier, header=True)
    elif options.citation is not None:
        citation_export(querier)
    else:
        txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
def upload_edge_from_graph(ls_edges, batch_size):
    """Upload edges from a list of (from, to, type) tuples, batch by batch.

    1. take `batch_size` edge tuples from the input list.
    2. create a graph upload request.
    3. fill edge properties; set edge start node and end node.
    4. if there's any error in the upload response, retry.
    5. print upload statistics.

    Keyword Arguments:
    ls_edges   -- list of edge tuples
    batch_size -- number of edges per upload request
    """
    len_edges = len(ls_edges)
    uploaded_number = 0
    batch_counter = 0
    for edge_counter in tqdm(range(0, len_edges, batch_size)):
        res = None
        error = None
        re_upload_error = None
        retry = 0
        nodes_fail_retry = 0
        graph_upload_request = graphUpload_pb2.GraphUploadRequest()
        while res is None:
            try:
                graph_upload_request = graphUpload_pb2.GraphUploadRequest()
                for e in ls_edges[batch_counter:batch_counter + batch_size]:
                    node_from = e[0]
                    node_to = e[1]
                    edge_type = e[2]
                    # page edge
                    if edge_type == 0:
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            node_to)
                    # categories edge
                    else:
                        if node_from in IGNORE_CATEGORIES:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_to)
                        edge.props.type = "HasSubset"
                graph_upload_request.uploadTag = "uploadWikiEdge"
                graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                res = gs_call.upload_graph(graph_upload_request)
            except HTTPError as e:
                if e.code in RETRIABLE_STATUS_CODES:
                    error = 'A retriable HTTP error %d occurred:\n%s' % (
                        e.code, e.reason)
                else:
                    raise
            except RETRIABLE_EXCEPTIONS as e:
                error = 'A retriable error occurred: %s' % e
            except GRAPH_EXCEPTIONS as e:
                ylog.debug('A graph error occurred: %s' % e)
                break
            if error is not None:
                print(error)
                retry += 1
                res = None
                if retry > MAX_RETRIES:
                    ylog.debug(res)
                    # exit("no longer attempting to retry.")
                    ylog.debug("no longer attempting to retry.")
                    error = None
                max_sleep = 2**retry
                sleep_seconds = random.random() * max_sleep
                print('Sleeping %f seconds and then retrying...' %
                      sleep_seconds)
                time.sleep(sleep_seconds)
        try:
            if res.edgeUpdateResultStatistics:
                ylog.debug(res.edgeUpdateResultStatistics)
                number = res.edgeUpdateResultStatistics.numOfCreations + \
                    res.edgeUpdateResultStatistics.numOfUpdates + \
                    res.edgeUpdateResultStatistics.numOfSkips
                uploaded_number += number
            if res.failedEdges:
                for err in res.failedEdges:
                    print(err)
                    print("start node: %s" %
                          err.edge.startNodeID.primaryKeyInDomain)
                    print("end node: %s" %
                          err.edge.endNodeID.primaryKeyInDomain)
        except:
            pass
        batch_counter += batch_size
    return uploaded_number
from tqdm import tqdm
import time
import json
import networkx as nx

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("cycles")

graph = nx.read_gexf('whole_edges.no_loops.gexf')
ls_nodes = list(graph.nodes)
counter = 0
total_nodes_num = 287966
rm_counter = 0
try:
    while True:
        ylog.debug('rm cycles loops number %s' % counter)
        for node in tqdm(ls_nodes):
            removed_counter = 0
            ylog.debug('rm cycles of node %s' % node)
            while True:
                try:
                    ls_loop = nx.find_cycle(graph, node)
                    # remove direct edge:
                    ylog.debug(ls_loop)
                    if len(ls_loop) == 2:
                        if ls_loop[0][0] == ls_loop[1][1] and ls_loop[0][
                                1] == ls_loop[1][0]:
                            graph.remove_edge(ls_loop[0][0], ls_loop[0][1])
                    # remove big loop:
def batch_upload(re, file_path, batch_size, func, start, end):
    """Batch upload categories or edges.

    1. read the sql file line by line.
    2. extract target strings using the regular expression.
    3. put the matches into a dictionary.
    4. call the upload function with that dict as input.

    Keyword Arguments:
    re         -- compiled regular expression
    file_path  -- source file path
    batch_size -- number of matches per upload batch
    func       -- upload function
    start      -- start position
    end        -- end position
    """
    uploaded_number = 0
    try:
        # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe5 in
        # position 7629: invalid continuation byte
        with open(file_path, 'rb') as f:
            for i, line in enumerate(tqdm(f)):
                line_start_position = 0
                line_end_position = len(line)
                # try to process the whole line in a while loop until it's done
                while True:
                    if i < start:
                        break
                    # elif i <= end:
                    try:
                        test_string = line[line_start_position:].decode(
                            'utf-8')
                        line_size = len(re.findall(test_string))
                    except UnicodeDecodeError as e:
                        line_end_position = e.start
                        ylog.debug('start at %s' % line_end_position)
                    finally:
                        string = line[line_start_position:
                                      line_end_position].decode('utf-8')
                        line_size = len(re.findall(string))
                    try:
                        last_span = re.search(string).span()[0]
                    except AttributeError:
                        break
                    line_size = len(re.findall(string))
                    for _ in range(0, line_size, batch_size):
                        # pause if a file named "pause" exists in the current dir
                        re_batch = {}
                        for j in range(batch_size):
                            re_batch[j] = re.search(string, last_span)
                            if re_batch[j] is not None:
                                last_span = re_batch[j].span()[1]
                        uploaded_count = func(re_batch)
                        uploaded_number += uploaded_count
                        line_end_position = len(line)
                        line_start_position = line_end_position + 10
                    else:
                        break
    except KeyboardInterrupt:
        print("uploaded number: %s" % uploaded_number)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
    return uploaded_number
            query=quote_plus(query), language=language, start=start)
        requests.packages.urllib3.disable_warnings(
            requests.packages.urllib3.exceptions.InsecureRequestWarning)
        r = requests.get(url=url,
                         proxies=proxies,
                         headers=headers,
                         allow_redirects=False,
                         verify=False,
                         timeout=30)
        time.sleep(5)
    except requests.exceptions.SSLError as e:
        print(e)
        # LOGGER.info(url)
        ylog.debug(domain)
        time.sleep(5)
        continue
    LOGGER.info(url)
    content = r.content
    charset = cchardet.detect(content)
    text = content.decode(charset['encoding'])
    bsObj = BeautifulSoup(text, "lxml")
    # result counts
    brief_counts = bsObj.find_all('div', id='gs_ab_md')[0].text
    print(brief_counts)
    text1 = brief_counts.replace(r',', "")
    pattern = re.compile(u'\d+')
    result_count = re.findall(pattern, text1)[0]
    print(result_count)
def upload_single_edge(e):
    res = None
    error = None
    retry = 0
    while res is None:
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            node_from = e[0]
            node_to = e[1]
            edge_type = e[2]
            # page edge
            if edge_type == 0:
                edge = graph_upload_request.graph.edges.add()
                edge.props.type = "HasElement"
                edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_from)
                edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                    node_to)
            # categories edge
            else:
                if node_from in IGNORE_CATEGORIES:
                    break
                edge = graph_upload_request.graph.edges.add()
                edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_from)
                edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_to)
                edge.props.type = "HasSubset"
            graph_upload_request.uploadTag = "uploadWikiEdge"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
            print(res)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        except GRAPH_EXCEPTIONS as e:
            break
        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                exit("no longer attempting to retry.")
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    try:
        if res.edgeUpdateResultStatistics:
            ylog.debug(res.edgeUpdateResultStatistics)
            uploaded_number = res.edgeUpdateResultStatistics.numOfCreations + \
                res.edgeUpdateResultStatistics.numOfUpdates + \
                res.edgeUpdateResultStatistics.numOfSkips
            ylog.debug(e)
        # if res.failedEdges:
        #     for err in res.failedEdges:
        #         print(err)
        #         print("start node: %s" %
        #               err.edge.startNodeID.primaryKeyInDomain)
        #         print("end node: %s" %
        #               err.edge.endNodeID.primaryKeyInDomain)
    except:
        pass
logger.setLevel(logging.DEBUG)

risk_model_path = '/home/weiwu/share/risk_model/'
# keep from double loading
stock_return = gftIO.zload(os.path.join(risk_model_path, 'stock_return.pkl'))
factors = gftIO.zload(os.path.join(risk_model_path, 'factors.pkl'))
market_capital = gftIO.zload(
    os.path.join(risk_model_path, 'market_capital.pkl'))
corr_half_life = gftIO.zload(
    os.path.join(risk_model_path, 'corr_half_life.pkl'))
var_half_life = gftIO.zload(
    os.path.join(risk_model_path, 'var_half_life.pkl'))

model = risk_model(stock_return, factors, market_capital, corr_half_life,
                   var_half_life)

ylog.debug('parse data')
# get all factor names
ls_fexponame = factors['osets'].asColumnTab()['O0'].apply(
    gftIO.gidInt2Str).tolist()
ind_factor_name = factors[ls_fexponame[0]].asColumnTab()['O0'].apply(
    gftIO.gidInt2Str).tolist()
style_factor_name = factors[ls_fexponame[1]].asColumnTab()['O0'].apply(
    gftIO.gidInt2Str).tolist()
allfactor = ind_factor_name + style_factor_name

# stock return preprocess
if isinstance(stock_return, gftIO.GftTable):
    # df_w_ret = stock_return.asMatrix().T.dropna(how='all', axis=1)
    df_stock_return = stock_return.asMatrix().dropna(axis=1, how='all')
def search(self, query, limit=10, download=False):
    """
    Performs a query on scholar.google.com, and returns a dictionary of
    results in the form {'papers': ...}. Unfortunately, as of now, captchas
    can potentially prevent searches after a certain limit.
    """
    start = 0
    results = {'papers': []}

    while True:
        try:
            self.sess.headers = {'user-agent': self.get_random_user_agent()}
            res = self.sess.get(SCHOLARS_BASE_URL,
                                allow_redirects=True,
                                params={
                                    'q': query,
                                    'hl': 'en',
                                    'start': start,
                                    'as_sdt': '0,5'
                                })
            ylog.debug(res.url)
        except requests.exceptions.RequestException as e:
            results['err'] = (
                'Failed to complete search with query %s (connection error)' %
                query)
            return results

        s = self._get_soup(res.content)
        papers = s.find_all('div', class_="gs_r")

        if not papers:
            if b'CaptchaRedirect' in res.content:  # res.content is bytes
                results['err'] = (
                    'Failed to complete search with query %s (captcha)' %
                    query)
            return results

        for paper in papers:
            if not paper.find('table'):
                source = None
                pdf = paper.find('div', class_='gs_ggs gs_fl')
                link = paper.find('h3', class_='gs_rt')
                # find link type
                try:
                    url_type = paper.find('span',
                                          class_='gs_ctg2').get_text()[1:-1]
                except:
                    url_type = None
                if pdf:
                    source = pdf.find('a')['href']
                elif link.find('a'):
                    source = link.find('a')['href']
                else:
                    continue
                article_link = link.find('a')['href']

                results['papers'].append({
                    'name':
                    re.sub(self.re_bracket, "",
                           link.text.replace("\xa0…", "")),
                    'url': source,
                    'article_link': article_link,
                    'type': url_type
                })

                if len(results['papers']) >= limit:
                    return results

        start += 10
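# Hedged usage sketch for search(); the class name `SciHub` and its
# constructor are hypothetical stand-ins for whatever class defines sess,
# _get_soup and re_bracket above.
sh = SciHub()  # hypothetical constructor
res = sh.search('quantum theory', limit=10)
for paper in res.get('papers', []):
    print(paper['name'], '->', paper['article_link'])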
uploaded_number = batch_upload(wiki_category_re,
                               category_path,
                               batch_size,
                               upload_cat_node,
                               start=0,
                               end=6080000000)
print("uploaded number: %s" % (uploaded_number))

# upload edge
# ylog.debug('reading link sql file')
# with open("graph_no_loop.pkl", 'rb') as fp:
#     itemlist = pickle.load(fp)
ylog.debug("uploading wiki category page links")
category_link_path = './data/zhwiki-latest-categorylinks.zhs.sql'
wiki_category_link_re = re.compile(
    "\(([0-9]+),('[^,]+'),('[^']+'),('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'),('[^']*'),('[^,]+'),('[^,]+')\)"
)
# for i in tqdm(itemlist[5308253:]):
#     upload_single_edge(i)
# uploaded_number = upload_edge_from_graph(itemlist[int(sys.argv[2]):],
#                                          int(sys.argv[1]))
uploaded_number = batch_upload(wiki_category_link_re,
                               category_link_path,
                               batch_size,
                               upload_edge,
                               start=0,