Example #1
    def gain_data(self,
                  query,
                  begin=None,
                  end=None,
                  language=None,
                  start=0,
                  nums=0,
                  pause=2):
        """First get the article count, then loop over the result pages."""
        init_url = self.req_url(query, begin, end, language, start, pause=2)
        # bsObj = self.Cold_boot(init_url)
        # TotalCount = self.counts_result(bsObj, start)
        pages = int(ceil(nums / 10))
        page = 0
        Allinformations = []
        while page <= pages:
            print(page)
            start = page * 10
            url = self.req_url(query, begin, end, language, start, pause=pause)
            ylog.debug(url)
            bsObj = self.Cold_boot(url)
            info = self.extract(bsObj)
            # print(type(bsObj))
            # info = self.content(bsObj)
            if len(info) == 0:
                break
            Allinformations = Allinformations + info
            page = page + 1
        infos = {
            # 'TotalCount': TotalCount,
            'QueryURL': init_url,
            'Allinformations': Allinformations
        }
        return infos
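
For reference, the paging arithmetic used above can be read in isolation; a minimal standalone sketch (the value of nums is hypothetical):

from math import ceil

nums = 37                      # hypothetical total number of results wanted
pages = int(ceil(nums / 10))   # ten results per page -> 4
offsets = [page * 10 for page in range(pages + 1)]  # mirrors `while page <= pages`
print(pages, offsets)          # 4 [0, 10, 20, 30, 40]
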
Example #2
    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, sends HTTP request and returns response payload.
        """
        if log_msg is None:
            log_msg = 'HTTP response data follow'
        if err_msg is None:
            err_msg = 'request failed'
        try:
            ScholarUtils.log('info', 'requesting %s' % unquote(url))

            req = Request(
                url=url,
                proxies=self.proxies,
                headers={'User-Agent': ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
            html = hdl.read()
            ylog.debug(log_msg)
            ScholarUtils.log('debug', log_msg)
            ScholarUtils.log('debug', '>>>>' + '-' * 68)
            ScholarUtils.log('debug', 'url: %s' % hdl.geturl())
            ScholarUtils.log('debug', 'result: %s' % hdl.getcode())
            ScholarUtils.log('debug', 'headers:\n' + str(hdl.info()))
            ScholarUtils.log('debug',
                             'data:\n' + html.decode('utf-8'))  # For Python 3
            ScholarUtils.log('debug', '<<<<' + '-' * 68)

            return html
        except Exception as err:
            ScholarUtils.log('info', err_msg + ': %s' % err)
            return None
Example #3
    def search_relation(self, bsObj, pause=2):

        # RelatedKw = []
        pq_content = self.pq_html(bsObj)
        # ylog.info(pq_content)
        related_str = str(pq_content)
        related_str_re = re.compile(r'"rfs":\[[^!]+\]')
        try:
            related_str_rfs = related_str_re.search(related_str).group()
        except AttributeError:
            LOGGER.debug(related_str)
            return None
        # ylog.debug(related_str_rfs)
        related_ls_re = re.compile(r'(:\[|,)("[A-Za-z\s\u4e00-\u9fa5]*")')
        ls_related = related_ls_re.findall(related_str_rfs)
        RelatedKw = [x[1][1:-1] for x in ls_related]
        ylog.debug("related keywords: %s" % RelatedKw)
        # if pq_content is not None:
        #     for item in pq_content('p._Bmc').items():
        #         href = item('a').attr('href')
        #         if href:
        #             o = urlparse(href, 'http')
        #             if o.netloc:
        #                 kw = href
        #             if href.startswith('/search?'):
        #                 href = parse_qs(o.query)['q'][0]
        #                 o = urlparse(href, 'http')
        #                 if o.path:
        #                     kw = href
        #             RelatedKw.append(kw)
        return RelatedKw
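
As a reference, a self-contained sketch of the "rfs" extraction performed above, run on a hypothetical snippet of a result page (the sample string and keywords are invented):

import re

sample = '... "rfs":["machine learning","深度学习","data mining"] ...'
rfs_re = re.compile(r'"rfs":\[[^!]+\]')
kw_re = re.compile(r'(:\[|,)("[A-Za-z\s\u4e00-\u9fa5]*")')

rfs_block = rfs_re.search(sample).group()
related = [m[1][1:-1] for m in kw_re.findall(rfs_block)]
print(related)  # ['machine learning', '深度学习', 'data mining']
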
Example #4
def batch_upload(re, file_path, BATCH_SIZE, func, start, end):
    """batch upload categories or page
    Keyword Arguments:
    re         -- regular expression
    source     -- file path
    BATCH_SIZE --
    func       -- upload function
    start      -- start position
    end        -- end position

    """
    # with open(file_path, 'r') as f:
    #     print("reading all lines from sql")
    #     total_line_size = len(f.readlines())
    with open(file_path, 'rb') as f:
        for i, line in enumerate(tqdm(f)):
            line_start_position = 0
            line_end_position = len(line)
            # try to process the whole line in a while loop until it's done
            while True:
                if i < start:
                    break


#                 elif i <= end:
                try:
                    test_string = line[line_start_position:].decode('utf-8')
                    line_size = len(re.findall(test_string))

                except UnicodeDecodeError as e:
                    line_end_position = e.start
                    ylog.debug('start at %s' % line_end_position)
                finally:
                    string = line[
                        line_start_position:line_end_position].decode('utf-8')
                    line_size = len(re.findall(string))
                    try:
                        last_span = re.search(string).span()[0]
                    except AttributeError:
                        break
                    line_size = len(re.findall(string))
                    for _ in range(0, line_size, BATCH_SIZE):
                        # pause if a file named 'pause' exists in the current dir
                        re_batch = {}
                        for j in range(BATCH_SIZE):
                            re_batch[j] = re.search(string, last_span)
                            if re_batch[j] is not None:
                                last_span = re_batch[j].span()[1]
                        func(re_batch)
                    line_end_position = len(line)
                    line_start_position = line_end_position + 10
            else:
                break
Example #5
def find_meta(title, doi):
    """ find metadata with title or doi
    Keyword Arguments:
    title --
    doi   --
    """
    ylog.info(title)
    works = Works()
    w1 = works.query(title).sort('relevance').order('desc')
    i = 0
    for item in w1:
        i = i + 1
        try:
            t = item.get('title')[0]
            sub_title = item.get('subtitle')[0]
        except:
            continue
        if SequenceMatcher(a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                a=title, b=sub_title).ratio() > 0.9:
            return item
        if i > 18:
            ylog.debug('[x]%s' % title)
            # ylog.debug(item['title'])
            return None
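
A minimal sketch of the fuzzy title matching that find_meta relies on (the candidate title is invented; 0.9 is the same threshold used above):

from difflib import SequenceMatcher

title = "Heterogeneous resistance to vancomycin in Staphylococcus epidermidis"
candidate = "Heterogeneous resistance to Vancomycin in staphylococcus epidermidis"
ratio = SequenceMatcher(a=title, b=candidate).ratio()
print(round(ratio, 3), ratio > 0.9)  # a near-identical title clears the 0.9 cutoff
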
Example #6
def upload_edge(ls_edges):
    """upload edge one by one
    Parameters:
    ls_edges -- list of edge tuples
    """
    len_edges = len(ls_edges)
    uploaded_number = 0
    batch_counter = 0
    for edge_counter in tqdm(range(0, len_edges, BATCH_SIZE)):

        res = None
        error = None
        retry = 0
        graph_upload_request = graphUpload_pb2.GraphUploadRequest()
        while res is None:
            try:
                graph_upload_request = graphUpload_pb2.GraphUploadRequest()
                for e in ls_edges[batch_counter:batch_counter + BATCH_SIZE]:
                    node_from = e[0]
                    node_to = e[1]
                    edge_type = e[2]

                    # page edge
                    if edge_type == 0:
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            node_to)
                    # categories edge
                    else:
                        if node_from in IGNORE_CATEGORIES:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_to)
                        edge.props.type = "HasSubset"
                graph_upload_request.uploadTag = "uploadWikiEdge"
                graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                res = gs_call.upload_graph(graph_upload_request)

            except HTTPError as e:
                if e.code in RETRIABLE_STATUS_CODES:
                    error = 'A retriable HTTP error %d occurred:\n%s' % (
                        e.code, e.reason)
                else:
                    raise
            except RETRIABLE_EXCEPTIONS as e:
                error = 'A retriable error occurred: %s' % e
            except GRAPH_EXCEPTIONS as e:
                ylog.debug('A graph error occurred: %s' % e)
                break
            if error is not None:
                print(error)
                retry += 1
                res = None
                if retry > MAX_RETRIES:
                    ylog.debug(res)
                    exit("no loger attempting to retry.")
                max_sleep = 2**retry
                sleep_seconds = random.random() * max_sleep
                print('Sleeping %f seconds and then retrying...' %
                      sleep_seconds)
                time.sleep(sleep_seconds)
        try:
            if res.edgeUpdateResultStatistics:
                ylog.debug(res.edgeUpdateResultStatistics)
                number = res.edgeUpdateResultStatistics.numOfCreations + \
                    res.edgeUpdateResultStatistics.numOfUpdates + \
                    res.edgeUpdateResultStatistics.numOfSkips
                uploaded_number += number
            if res.failedEdges:
                for err in res.failedEdges:
                    ylog.debug(err)
                    ylog.debug("start node: %s" %
                               err.edge.startNodeID.primaryKeyInDomain)
                    ylog.debug("end node: %s" %
                               err.edge.endNodeID.primaryKeyInDomain)
        except:
            pass
        batch_counter += BATCH_SIZE

    return uploaded_number
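
The retry loop above implements exponential backoff with random jitter and a retry cap. A self-contained sketch of just that policy (MAX_RETRIES and the RuntimeError are stand-ins for the retriable errors handled above):

import random
import time

MAX_RETRIES = 5

def call_with_backoff(operation):
    """Call `operation`, sleeping random.random() * 2**retry between attempts."""
    retry = 0
    while True:
        try:
            return operation()
        except RuntimeError as err:  # stand-in for a retriable error
            retry += 1
            if retry > MAX_RETRIES:
                raise
            sleep_seconds = random.random() * 2 ** retry
            print('Sleeping %f seconds and then retrying... (%s)' % (sleep_seconds, err))
            time.sleep(sleep_seconds)
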
Example #7
# ylog.debug("test")
batch_size = 2
# test fetch graph
test_url = 'http://192.168.1.166:9080'
prod_url = 'http://q.gftchina.com:13567/vqservice/vq/'
test_user_name = 'wuwei'
test_pwd = 'gft'
gs_call = gftIO.GSCall(test_url, test_user_name, test_pwd)
try:
    graph = gftIO.get_graph_from_neo4j('392482970E904D11190D208B7C22874A',
                                       server_url=test_url,
                                       user_name=test_user_name,
                                       pwd=test_pwd)
except:
    pass

# read sql file
ylog.debug('reading sql files')
# category
category_path = "/home/weiwu/share/deep_learning/data/zhwiki_cat_pg_lk/zhwiki-latest-category.zhs.sql"
category_sql = open(category_path, 'r')
category = category_sql.read()
category_sql.close()
wiki_category_re = re.compile(
    r"\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)")
wiki_category_size = len(wiki_category_re.findall(category))

# TODO: add counter of successful uploaded edges.
ylog.debug('start uploading edges')
last_span = wiki_category_re.search(category).span()[0]
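
A quick sanity check of wiki_category_re against a single, invented row from the category dump:

import re

wiki_category_re = re.compile(
    r"\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)")
sample_row = "INSERT INTO `category` VALUES (15,'数学',208,88,0);"
print(wiki_category_re.findall(sample_row))
# [('15', "'数学'", '208', '88', '0')]
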
Example #8
ylog.console_on()
ylog.filelog_on("app")

works = Works()

title = """Heterogeneous resistance to vancomycin in Staphylococcus epidermidis, Staphylococcus haemolyticus and Staphylococcus warneri clinical strains: characterisation"""
w1 = works.query(title).sort('relevance').order('desc')
i = 0
target_doi = '10.1109/icdcs.2006.48'
items_result = None
for item in w1:
    i = i + 1
    try:
        t = item.get('title')[0]
        sub_title = item.get('subtitle')[0]
        ylog.debug('crossref item title ')
        ylog.debug(t)
        ylog.debug(sub_title)
    except:
        ylog.debug(item)
        continue
    if SequenceMatcher(a=title, b=t).ratio() > 0.8:
        found_doi = item['DOI']
        ylog.debug("target doi: %s" % target_doi)
        ylog.debug("found  doi: %s" % found_doi)
        if target_doi[:10] == found_doi[:10] or SequenceMatcher(
                a=target_doi, b=found_doi).ratio() > 0.9:
            print('found')
            break
    if i > 0:
        ylog.debug('[x]%s' % title)
Example #9
File: main.py Project: leolle/simulate
def risk_model(df_ret, dict_risk_expo, capital, corr_half_life, var_half_life):
    """
    Regression stock return by previous factor exposure, to get
    factor return covariance and residual.

    Pseudo code:
    1. process input data, parse, drop and fill.
    2. get intersection of all factor names, all symbol names, all dates.
    3. Solve the problem of heteroskedasticity by square root the market capitalization.
    Handbook p5, p15.
    new return = square root of market capitalization * stock return,
    add a constraint column to new return.

    calculate factor return.
    calculate factor return covariance.
    calculate the residual(specific) variances of regression.
    generate final return value.

    Keyword Arguments:
    df_ret           -- pd.DataFrame, stock daily return.
    dict_risk_expo   -- dictionary, factor exposure, key=factor.
    capital          -- pd.DataFrame, stock market capital, to calculate weight.
    corr_half_life   -- int, to compare correlation half life.
    var_half_life    -- int, to compare variance half life.

    Return:
    27 industrial factors + 8 style factors return -- pd.DataFrame
    ret_cov                                        -- pd.DataFrame, return covariance
    specificRisk                                   -- pd.DataFrame, residual
    """

    # get all factor names
    ylog.debug('parse data')
    ls_fexponame = list(
        map(gftIO.gidInt2Str,
            list(dict_risk_expo['osets'].asColumnTab()['O0'])))
    ind_factor_name = sorted(
        list(
            map(gftIO.gidInt2Str,
                list(dict_risk_expo[ls_fexponame[0]].asColumnTab()['O0']))))
    sty_factor_name = sorted(
        list(
            map(gftIO.gidInt2Str,
                list(dict_risk_expo[ls_fexponame[1]].asColumnTab()['O0']))))
    allfactor = ind_factor_name + sty_factor_name

    ##stock return preprocess
    df_w_ret = df_ret.asMatrix().T.dropna(how='all', axis=1)

    ##get factor exposure date list(all snapshots)
    dict_risk_expo_new = {
        factorname: dict_risk_expo[factorname].asMatrix().dropna(how='all')
        for factorname in allfactor
    }
    ls_ls_fexpodate = list([
        dict_risk_expo_new[factorname].index.tolist()
        for factorname in dict_risk_expo_new.keys()
    ])
    ls_alldates_fexpo = reduce(np.intersect1d, ls_ls_fexpodate)

    ## get factor exposure symbol list
    ls_ls_fexposymbol = list([
        dict_risk_expo_new[factorname].columns.tolist()
        for factorname in dict_risk_expo_new.keys()
    ])
    ls_allsymbols_fexpo = reduce(np.intersect1d, ls_ls_fexposymbol)

    ##weight preprocess
    weight = capital.asMatrix().T

    ##get the date/symbol intersection of (stock return,factor exposure,capital)

    ##ls_alldates save the stock return map date

    ##get fexpo date,find the nearest business day

    fexpodate = pd.DataFrame(ls_alldates_fexpo, columns=['date_fexpo'])
    retdate = pd.DataFrame(df_w_ret.columns, columns=['date_ret'])

    retdate.sort_values("date_ret", ascending=True, inplace=True)
    fexpodate.sort_values("date_fexpo", ascending=True, inplace=True)

    df_date_map = pd.merge_asof(retdate,
                                fexpodate,
                                left_on="date_ret",
                                right_on="date_fexpo",
                                allow_exact_matches=False)

    df_date_map.dropna(how='any', inplace=True)
    df_date_map = df_date_map.drop_duplicates(
        subset='date_fexpo').reset_index()
    dict_date_map = {
        df_date_map.date_fexpo[i]: df_date_map.date_ret[i]
        for i in range(len(df_date_map))
    }

    ls_alldates = sorted(
        list(
            set(capital.columns).intersection(set(
                df_w_ret.columns)).intersection(set(dict_date_map.values()))))
    ls_alldates_ondaybefore = sorted(list(dict_date_map.keys()))
    ##get daily symbol list
    ls_allsymbols = {
        date: list(
            set(df_w_ret[[dict_date_map[date]]].dropna().index).intersection(
                set(ls_allsymbols_fexpo)).intersection(set(capital.index)))
        for date in ls_alldates_ondaybefore
    }

    ## align the stock return and factor exposure
    dict_df_capital_raw = {
        date: capital[[date]].reindex(index=ls_allsymbols[date]).fillna(0)
        for date in ls_alldates_ondaybefore
    }
    dict_df_capital = {
        date: np.sqrt(dict_df_capital_raw[date])
        for date in ls_alldates_ondaybefore
    }

    dict_df_ret = {
        dict_date_map[date]:
        pd.concat([(df_w_ret[[dict_date_map[date]
                              ]].reindex(index=ls_allsymbols[date])) *
                   (dict_df_capital[date].rename(
                       columns={date: dict_date_map[date]})),
                   pd.DataFrame(data=np.zeros(1),
                                index=['constrain'],
                                columns=[dict_date_map[date]])],
                  axis=0)
        for date in ls_alldates_ondaybefore
    }
    dict_df_fexpo_raw = {
        date: fexpomerge(dict_risk_expo_new, date, allfactor, ls_allsymbols)
        for date in ls_alldates_ondaybefore
    }
    dict_df_fexpo = {
        date: dict_df_fexpo_raw[date].assign(countryfactor=1).multiply(
            dict_df_capital[date].squeeze(), axis='index')
        for date in ls_alldates_ondaybefore
    }

    ##calculate constraints
    dict_df_fexpo_con = {
        date:
        expoconstrain(dict_df_fexpo_raw, date, ind_factor_name, allfactor,
                      dict_df_capital_raw, sty_factor_name, dict_df_fexpo)
        for date in ls_alldates_ondaybefore
    }

    # for i in dict_risk_expo_new.keys():
    #     if dict_risk_expo_new[i].index.min() > df_l_ret.index.min(
    #     ) or dict_risk_expo_new[i].index.max() < df_l_ret.index.max():
    #         raise Exception

    ########################step3:calculate factor return########################

    ls_df_fitresult = {
        dict_date_map[date]:
        Regression(date, dict_df_ret, dict_df_fexpo_con, dict_df_capital,
                   dict_df_fexpo, dict_date_map)
        for date in ls_alldates_ondaybefore
    }

    ls_df_facreturn = list(
        ls_df_fitresult[date]['params'].rename(columns={'params': date})
        for date in ls_alldates)
    df_model_params = reduce(
        lambda df_para1, df_para2: pd.concat([df_para1, df_para2], axis=1),
        ls_df_facreturn)

    ########################step4:calculate factor return covariance########################

    df_allfactorret = df_model_params.T
    df_allfactorret = df_allfactorret.sort_index()

    corrhalflife = int(corr_half_life)
    varhalflife = int(var_half_life)

    halflife = max(corrhalflife, varhalflife)

    if len(ls_alldates) < halflife:
        raise Exception("More data needed")
    else:
        ls_alldatesnew = ls_alldates[halflife - 1:len(ls_alldates)]
        corrwgts = list(
            map(lambda x: mt.sqrt(0.5**(x / int(corrhalflife))),
                list(range(int(corrhalflife) - 1, -1, -1))))
        varwgts = list(
            map(lambda x: mt.sqrt(0.5**(x / int(varhalflife))),
                list(range(int(varhalflife) - 1, -1, -1))))

        ls_factorretcov = list(
            calcfactorRetCov(df_allfactorret, date, corrwgts, varwgts,
                             corrhalflife, varhalflife)
            for date in ls_alldatesnew)
        df_l_factorretcov = pd.concat(
            ls_factorretcov, axis=0).rename(columns={'variable': 'factorid2'})

        ########################step5:calculate the residual(specific) variances of regression########################

        ##part1:merge factorreturn,factor exposure and stock return
        ls_specificrisk = list(
            ls_df_fitresult[date]['resid'].rename(columns={'resid': date})
            for date in ls_alldates)
        df_w_specificrisk = pd.concat(ls_specificrisk, axis=1).T
        df_w_specificrisk = df_w_specificrisk.sort_index()
        specificwgts = list(
            map(lambda x: mt.sqrt(0.5**(x / int(halflife))),
                list(range(int(halflife) - 1, -1, -1))))

        ls_factorretspe = list(
            calcfactorRetSpe(df_w_specificrisk, date, specificwgts, halflife)
            for date in ls_alldatesnew)
        df_specificrisk_var = pd.concat(ls_factorretspe, axis=0)

        ########################step6:generate final return value########################
        df_allfactorret = df_allfactorret.drop('countryfactor', axis=1)
        dict_factorret = {
            key + '.ret': df_allfactorret[[key]].rename(
                columns={
                    key:
                    list(
                        gftIO.strSet2Np(
                            np.array(list(df_allfactorret[[key]].columns))))[0]
                })
            for key in df_allfactorret.columns
        }
        dictMerged = dict(
            dict_factorret, **{
                'ret_cov': df_l_factorretcov,
                'specificRisk': df_specificrisk_var
            })
        return dictMerged
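
The heteroskedasticity treatment described in the docstring (scale both returns and exposures by the square root of market capitalization before the fit) is equivalent to a weighted least-squares regression. A toy sketch with random data, not the production inputs used above:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n_stocks, n_factors = 100, 5
exposure = rng.normal(size=(n_stocks, n_factors))           # factor exposures
true_factor_ret = rng.normal(size=n_factors)
ret = exposure @ true_factor_ret + rng.normal(scale=0.1, size=n_stocks)
cap = rng.uniform(1.0, 10.0, size=n_stocks)                  # market capitalization

w = np.sqrt(cap)                                             # sqrt-cap weights
fit = sm.OLS(ret * w, exposure * w[:, None]).fit()
print(fit.params)  # estimated factor returns, close to true_factor_ret
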
Example #10
import statsmodels.regression.linear_model as lm
import statsmodels.api as sm
import datetime
import pandas as pd
import numpy as np
import re
import os
import warnings
import logging
import math as mt
from functools import reduce
from lib.gftTools import gftIO
from ylib import ylog

ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("app")

risk_model_path = '/home/weiwu/share/risk_model/'
x0 = gftIO.zload(os.path.join(risk_model_path, 'stock_return.pkl'))
x1 = gftIO.zload(os.path.join(risk_model_path, 'factors.pkl'))

x2 = gftIO.zload(os.path.join(risk_model_path, 'market_capital.pkl'))
x3 = 4
x4 = 5

ylog.debug('parse data')
Example #11
    def find_meta(self, identifier):
        """ find metadata with title or DOI
        Keyword Arguments:
        identifier --
        """
        try:
            # verify=False is dangerous but sci-hub.io
            # requires intermediate certificates to verify
            # and requests doesn't know how to download them.
            # as a hacky fix, you can add them to your store
            # and verifying would work. will fix this later.
            url = self.base_url + identifier['article_link']
            self.sess.headers = {'user-agent': self.get_random_user_agent()}
            res = self.sess.get(url, verify=False, allow_redirects=False)
            re_bracket = re.compile(r"\[(.*?)\]\s")
            title = re.sub(re_bracket, "", identifier['name'])
            ylog.debug('*' * 80)
            ylog.debug("title: %s" % title)
            ylog.debug(res.status_code)
            # self.out.ix[title]['status_code'] = res.status_code
            ylog.debug("headers: %s" % res.headers['Content-Type'])
            ylog.debug('location: %s' % res.headers.get("Location"))
            # self.out.ix[title]['location'] = res.headers.get("Location")
            search_title = True
            if not res.headers.get("Location"):
                content = res.content
                if len(content) > 2:
                    import cchardet
                    charset = cchardet.detect(content)
                    text = content.decode(charset['encoding'])
                    soup = BeautifulSoup(text, "lxml")
                    script = soup.script.get_text()
                    doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
                    try:
                        doi_match = re.compile(doi_regexp).findall(script)[0]
                        ylog.info("DOI: %s" % doi_match)
                        search_title = False
                        # use crossref API to get metadata
                        works = Works()
                        w1 = works.query(doi_match).sort('relevance').order(
                            'desc')
                        i = 0
                        for item in w1:
                            # TODO: verify title
                            # self.out.ix[title]['DOI'] = item['DOI']
                            return {'meta': item['DOI'], 'url': url}
                    except IndexError:
                        ylog.debug('failed to find regexp')
            elif search_title:
                works = Works()
                w1 = works.query(title).sort('relevance').order('desc')
                i = 0
                for item in w1:
                    i = i + 1
                    try:
                        # ylog.debug('crossref item title ')
                        t = item.get('title')[0]
                        # ylog.debug(t)
                        sub_title = item.get('subtitle')[0]
                        # ylog.debug(sub_title)
                        # ylog.debug("ratio: %s" %
                        #            (SequenceMatcher(a=title, b=t).ratio()))
                    except TypeError:
                        sub_title = ''
                    if SequenceMatcher(
                            a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                                a=title, b=sub_title).ratio(
                                ) > 0.9 or t.startswith(title):
                        ylog.debug("DOI %s" % item['DOI'])
                        # self.out.ix[title]['DOI'] = item['DOI']
                        return {'meta': item['DOI'], 'url': url}
                    if i > 18:
                        # ylog.debug('[x]%s' % title)
                        # ylog.debug(item['title'])
                        return None

        except requests.exceptions.ConnectionError:
            logger.info('{} cannot access, changing'.format(
                self.available_base_url_list[0]))
            self._change_base_url()

        except requests.exceptions.RequestException as e:

            return {
                'err':
                'Failed to fetch pdf with identifier %s (resolved url %s) due to request exception.'
                % (identifier, url)
            }
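
A quick check of the DOI regular expression used above against an invented script snippet:

import re

doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
script = 'var article = {"doi": "10.1109/icdcs.2006.48", "oa": false};'
print(re.compile(doi_regexp).findall(script))  # ['10.1109/icdcs.2006.48']
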
Example #12
    from tempfile import gettempdir
    tmp_dir = gettempdir()
    output = open(tmp_dir + '/test.txt', 'w')
    logging.info("extract pages")
    for page_id in ls_pageid:
        logging.info(page_id)
        try:
            text = gs_call.get_nodes_binary_data([page_id])
        except DecodeError:
            continue
        page = text.entries[0].data.data.decode('utf-8')
        text = preprocess_string(page)
        # ylog.debug(text)
        output.write(text + '\n')
    output.close()


if __name__ == '__main__':
    # ylog.set_level(logging.DEBUG)
    # ylog.console_on()
    # ylog.filelog_on("wiki_upload")
    gs_call = gftIO.GSCall(prod_url, test_user_name, test_pwd)
    cat_path = user_path + '/share/deep_learning/data/GID/cat.txt'
    page_path = user_path + "/share/deep_learning/data/GID/page.txt"
    page_gid_file = open(page_path)
    lines = page_gid_file.read().splitlines()
    page_gid = [s.strip() for s in lines]
    for gid in page_gid:
        ylog.debug(gid)
        extract_pages(gid, gs_call)
Example #13
def upload_node(dict_re_match_object):
    """ upload regular expression object in the dictionary in a batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill node properties.
    use encoded original Chinese title plus url as url property.
    4. if there's any error upload response, retry.
    5. print upload statistics.
    Keyword Arguments:
    re_match_object -- re object
    """
    res = None
    error = None
    re_upload_error = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate nodes batch
            for index, value in dict_re_match_object.items():
                if value is not None:
                    item = dict_re_match_object.get(index)
                    # print(item)
                    title = item.group()[1:-1]
                    zh_title = HanziConv.toSimplified(title)
                    # if zh_title in IGNORE_CATEGORIES:
                    #     break
                    node = graph_upload_request.graph.nodes.add()
                    node.props.type = "readonlyDoc"
                    # p1 = node.props.props.entries.add()
                    # p1.key = "url"
                    # p1.value = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(
                    #     title)
                    p2 = node.props.props.entries.add()
                    p2.key = "_s_import_source"
                    p2.value = "word2vec model"

                    node.businessID.url = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(
                        title)
                    node.names.chinese = zh_title

            # other information of the upload request
            graph_upload_request.uploadTag = "UploadWord2VecVocabNodes"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        try:
            if res.failedNodes:
                re_upload_error = "some nodes failed to upload %s" % res.failedNodeds
        except:
            pass
        if re_upload_error is not None:
            print(re_upload_error)
            nodes_fail_retry += 1
            res = None
            if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
                ylog.debug(res)
                res = "continue"

        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                # break
                # exit("no loger attempting to retry.")
            ylog.debug(res)
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    # ylog.debug(res)
    # jump out while response is None:
    try:
        if res.nodeUpdateResultStatistics:
            ylog.debug(res.nodeUpdateResultStatistics)
            uploaded_number = res.nodeUpdateResultStatistics.numOfCreations + \
                res.nodeUpdateResultStatistics.numOfUpdates + \
                res.nodeUpdateResultStatistics.numOfSkips
        if res.uploadedNodes:
            for updated in res.uploadedNodes:
                ylog.debug("uploaded node GID: %s" % updated.gid)
        if res.failedNodes:
            for err in res.failedNodes:
                if err.error.errorCode != 202001:
                    ylog.info(err.error)
                    ylog.debug(err.error)
    except:
        pass

    return uploaded_number
Example #14
def delete_edge(dict_re_match_object):
    """ delete edge regular expression object in the dictionary in a batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill edge properties.
    set edge start node and end node.
    4. if there's any error upload response, retry.
    5. print upload statistics.
    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')
    Keyword Arguments:
    re_match_object -- re object
    """
    uploaded_number = 0
    for index, value in dict_re_match_object.items():
        if value is not None:
            item = dict_re_match_object.get(index)
            edge_type = item.group(7)[1:-1]
            del_edge_type = None
            if edge_type == 'page':
                page_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in page_title:
                    end = page_title.split("\\n")
                    page_title = end[-1]
                page_title = page_title.replace(" ", "_")

                startNodeID_domain = "https://zh.wikipedia.org/wiki/Category:"
                startNodeID_primaryKeyInDomain = cat_title

                endNodeID_domain = "https://zh.wikipedia.org/wiki/"
                endNodeID_primaryKeyInDomain = page_title

                del_edge_type = "HasElement"

            if edge_type == 'subcat':
                subcat_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in subcat_title:
                    end = subcat_title.split("\\n")
                    subcat_title = end[-1]
                subcat_title = subcat_title.replace(" ", "_")
                subcat_title_zh = HanziConv.toSimplified(subcat_title)
                cat_title_zh = HanziConv.toSimplified(cat_title)
                startNodeID_domain = "https://zh.wikipedia.org/wiki/Category:"
                startNodeID_primaryKeyInDomain = cat_title
                endNodeID_domain = "https://zh.wikipedia.org/wiki/Category:"
                endNodeID_primaryKeyInDomain = subcat_title
                del_edge_type = "HasSubset"
            if del_edge_type is not None:
                start_node_pk = startNodeID_domain + "/" + startNodeID_primaryKeyInDomain
                end_node_pk = endNodeID_domain + "/" + endNodeID_primaryKeyInDomain
                start_node_hash = hashlib.md5(
                    start_node_pk.encode('utf-8')).hexdigest().upper()
                end_node_hash = hashlib.md5(
                    end_node_pk.encode('utf-8')).hexdigest().upper()
                get_or_else = ""
                get_source = ""
                get_target = ""
                edge_str = "|".join([
                    start_node_hash, end_node_hash, del_edge_type, get_or_else,
                    get_source, get_target
                ])
                edge_md5 = hashlib.md5(
                    edge_str.encode('utf-8')).hexdigest().upper()
                del_edge_type = None
                res = None
                error = None
                retry = 0
                while res is None:
                    try:
                        res = gs_call.delete_edge(edge_md5, False)
                    except GSError as e:
                        # error = 'edge not existed'
                        res = 'failed'
                        ylog.debug('failed  %s from %s to %s' %
                                   (edge_md5, start_node_hash, end_node_hash))
                    except HTTPError as e:
                        if e.code in RETRIABLE_STATUS_CODES:
                            error = 'A retriable HTTP error %d occurred:\n%s' % (
                                e.code, e.reason)
                        else:
                            raise
                    else:
                        res = 'success'
                        ylog.debug('deleted %s from %s to %s' %
                                   (edge_md5, start_node_hash, end_node_hash))

                    if error is not None:
                        print(error)
                        retry += 1
                        # res = None
                        if retry > MAX_RETRIES:
                            ylog.debug(res)
                            exit("no loger attempting to retry.")
                        max_sleep = 2**retry
                        sleep_seconds = random.random() * max_sleep
                        print('Sleeping %f seconds and then retrying...' %
                              sleep_seconds)
                        time.sleep(sleep_seconds)
                    if res == 'success':
                        uploaded_number += 1
                        ylog.debug('deleted %s from %s to %s' %
                                   (edge_md5, start_node_hash, end_node_hash))

    return uploaded_number
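
For reference, the edge id deleted above is the MD5 of a pipe-joined string built from the two node hashes, the edge type and three empty fields. A standalone sketch with invented titles, mirroring the string construction above:

import hashlib

def wiki_edge_md5(cat_title, page_title, edge_type="HasElement"):
    start_pk = "https://zh.wikipedia.org/wiki/Category:" + "/" + cat_title
    end_pk = "https://zh.wikipedia.org/wiki/" + "/" + page_title
    start_hash = hashlib.md5(start_pk.encode('utf-8')).hexdigest().upper()
    end_hash = hashlib.md5(end_pk.encode('utf-8')).hexdigest().upper()
    edge_str = "|".join([start_hash, end_hash, edge_type, "", "", ""])
    return hashlib.md5(edge_str.encode('utf-8')).hexdigest().upper()

print(wiki_edge_md5("数学", "代数"))  # a 32-character upper-case hex digest
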
Example #15
def upload_edge(dict_re_match_object):
    """ upload edge regular expression object in the dictionary in a batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill edge properties.
    set edge start node and end node.
    4. if there's any error upload response, retry.
    5. print upload statistics.
    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')
    (id, from, to,...)
    Keyword Arguments:
    re_match_object -- re object
    """
    res = None
    error = None
    re_upload_error = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate nodes batch
            for index, value in dict_re_match_object.items():
                if value is not None:
                    item = dict_re_match_object.get(index)
                    edge_type = item.group(7)[1:-1]
                    if edge_type == 'page':
                        page_title = item.group(3)[1:-1]
                        cat_title = item.group(2)[1:-1]
                        if '\\n' in cat_title:
                            end = cat_title.split("\\n")
                            cat_title = end[-1]
                        if '\\n' in page_title:
                            end = page_title.split("\\n")
                            page_title = end[-1]
                        page_title = page_title.replace(" ", "_")
                        page_title_zh = HanziConv.toSimplified(page_title)
                        cat_title_zh = HanziConv.toSimplified(cat_title)
                        # if not cat_title_zh in EXAMPLE_CATEGORIES_PAGE_DICT:
                        #     continue

                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            cat_title)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            page_title)

                    if edge_type == 'subcat':
                        subcat_title = item.group(3)[1:-1]
                        cat_title = item.group(2)[1:-1]
                        if '\\n' in cat_title:
                            end = cat_title.split("\\n")
                            cat_title = end[-1]
                        if '\\n' in subcat_title:
                            end = subcat_title.split("\\n")
                            subcat_title = end[-1]
                        subcat_title = subcat_title.replace(" ", "_")
                        subcat_title_zh = HanziConv.toSimplified(subcat_title)
                        cat_title_zh = HanziConv.toSimplified(cat_title)

                        # if not cat_title_zh in EXAMPLE_CATEGORIES_PAGE_DICT:
                        #     continue
                        if subcat_title_zh == cat_title_zh:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            cat_title)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            subcat_title)
                        edge.props.type = "HasSubset"

            graph_upload_request.uploadTag = "uploadWikiEdge"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
            # ylog.debug(res)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        except GRAPH_EXCEPTIONS as e:
            break
        # try:
        #     if res.failedEdges:
        #         re_upload_error = "some nodes failed to upload %s" % res.failedEdges
        # except:
        #     pass
        # if re_upload_error is not None:
        #     print(re_upload_error)
        #     nodes_fail_retry += 1
        #     res = None
        #     if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
        #         ylog.debug(res)
        #         res = "continue"
        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                ylog.debug("no loger attempting to retry.")
                error = None
                # exit("no loger attempting to retry.")
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    try:
        if res.edgeUpdateResultStatistics:
            ylog.debug(res.edgeUpdateResultStatistics)
            uploaded_number = res.edgeUpdateResultStatistics.numOfCreations + \
                res.edgeUpdateResultStatistics.numOfUpdates + \
                res.edgeUpdateResultStatistics.numOfSkips
        if res.failedEdges:
            for err in res.failedEdges:
                ylog.debug(err)
                ylog.debug("start node: %s" %
                           err.edge.startNodeID.primaryKeyInDomain)
                ylog.debug("end node: %s" %
                           err.edge.endNodeID.primaryKeyInDomain)
    except:
        pass

    return uploaded_number
    ls_edges = []
    for e in tqdm(graph.edges):
        node_from = e[0]
        node_to = e[1]
        edge_type = graph[node_from][node_to]['subtype']
        ls_edges.append(tuple([node_from, node_to, edge_type]))
    import pickle

    with open('graph_whole.pkl', 'wb') as fp:
        pickle.dump(ls_edges, fp)
    return ls_edges


wiki_category_re = re.compile(
    r"\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)")
ylog.debug('create graph nodes')
batch_upload(wiki_category_re,
             category_path,
             200,
             add_node,
             start=0,
             end=10000000)

ylog.debug('create graph edges')
batch_upload(wiki_category_link_re,
             category_link_path,
             200,
             add_edge,
             start=0,
             end=10000000)
d = defaultdict(list)
Example #17
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)
    if options.count is not None:
        options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        csv(querier)
    elif options.csv_header:
        csv(querier, header=True)
    elif options.citation is not None:
        citation_export(querier)
    else:
        txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0


if __name__ == "__main__":
    ylog.debug('start')
    sys.exit(main())
    # main([
    #     '-c', '1', '--author', "albert einstein", '--phrase', 'quantum theory'
    # ])
Example #18
# test fetch graph
test_url = 'http://192.168.1.166:9080'
prod_url = 'http://q.gftchina.com:13567/vqservice/vq/'
test_user_name = 'wuwei'
test_pwd = 'gft'
gs_call = gftIO.GSCall(test_url, test_user_name, test_pwd)
try:
    graph = gftIO.get_graph_from_neo4j('392482970E904D11190D208B7C22874A',
                                       server_url=test_url,
                                       user_name=test_user_name,
                                       pwd=test_pwd)
except:
    pass

# read sql file
ylog.debug('reading sql files')
# category
category_path = "/home/weiwu/share/deep_learning/data/zhwiki_cat_pg_lk/zhwiki-latest-category.zhs.sql"
category_sql = open(category_path, 'r')
category = category_sql.read()
category_sql.close()
wiki_category_re = re.compile(
    r"\(([0-9]+),('[^,]+'),([0-9]+),([0-9]+),([0-9]+)\)")
wiki_category = wiki_category_re.findall(category)

# page
page_path = "/home/weiwu/share/deep_learning/data/zhwiki_cat_pg_lk/zhwiki-latest-page.zhs.sql"
page_sql = open(page_path, 'r')
page = page_sql.read()
page_sql.close()
wiki_page_re = re.compile(
Example #19
def main():
    usage = """scholar.py [options] <query string>
    A command-line interface to Google Scholar.
    Examples:
    # Retrieve one article written by Einstein on quantum theory:
    scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"
    # Retrieve a BibTeX entry for that quantum theory paper:
    scholar.py -c 1 -C 17749203648027613321 --citation bt
    # Retrieve five articles written by Einstein after 1970 where the title
    # does not contain the words "quantum" and "theory":
    scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    group.add_option(
        '-a',
        '--author',
        metavar='AUTHORS',
        default=None,
        help='Author name(s)')
    group.add_option(
        '-A',
        '--all',
        metavar='WORDS',
        default=None,
        dest='allw',
        help='Results must contain all of these words')
    group.add_option(
        '-s',
        '--some',
        metavar='WORDS',
        default=None,
        help=
        'Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases'
    )
    group.add_option(
        '-n',
        '--none',
        metavar='WORDS',
        default=None,
        help=
        'Results must contain none of these words. See -s|--some re. formatting'
    )
    group.add_option(
        '-p',
        '--phrase',
        metavar='PHRASE',
        default=None,
        help='Results must contain exact phrase')
    group.add_option(
        '-t',
        '--title-only',
        action='store_true',
        default=False,
        help='Search title only')
    group.add_option(
        '-P',
        '--pub',
        metavar='PUBLICATIONS',
        default=None,
        help='Results must have appeared in this publication')
    group.add_option(
        '--after',
        metavar='YEAR',
        default=None,
        help='Results must have appeared in or after given year')
    group.add_option(
        '--before',
        metavar='YEAR',
        default=None,
        help='Results must have appeared in or before given year')
    group.add_option(
        '--no-patents',
        action='store_true',
        default=False,
        help='Do not include patents in results')
    group.add_option(
        '--no-citations',
        action='store_true',
        default=False,
        help='Do not include citations in results')
    group.add_option(
        '-C',
        '--cluster-id',
        metavar='CLUSTER_ID',
        default=None,
        help='Do not search, just use articles in given cluster ID')
    group.add_option(
        '-c',
        '--count',
        type='int',
        default=None,
        help='Maximum number of results')
    parser.add_option_group(group)

    group = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    group.add_option(
        '--txt',
        action='store_true',
        help='Print article data in text format (default)')
    group.add_option(
        '--txt-globals',
        action='store_true',
        help='Like --txt, but first print global results too')
    group.add_option(
        '--csv',
        action='store_true',
        help='Print article data in CSV form (separator is "|")')
    group.add_option(
        '--csv-header',
        action='store_true',
        help='Like --csv, but print header with column names')
    group.add_option(
        '--citation',
        metavar='FORMAT',
        default=None,
        help=
        'Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).'
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option(
        '--cookie-file',
        metavar='FILE',
        default=None,
        help=
        'File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.'
    )
    group.add_option(
        '-d',
        '--debug',
        action='count',
        default=0,
        help=
        'Enable verbose logging to stderr. Repeated options increase detail of debug output.'
    )
    group.add_option(
        '-v',
        '--version',
        action='store_true',
        default=False,
        help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()
    ylog.debug(options)
    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug'])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is scholar.py %s.' % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print(
                'Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print(
            'Invalid citation link format, must be one of "bt", "en", "rm", or "rw".'
        )
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)
    if options.count is not None:
        options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        csv(querier)
    elif options.csv_header:
        csv(querier, header=True)
    elif options.citation is not None:
        citation_export(querier)
    else:
        txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
Example #20
def upload_edge_from_graph(ls_edges, batch_size):
    """ upload edge regular expression object in the dictionary in a batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill edge properties.
    set edge start node and end node.
    4. if there's any error upload response, retry.
    5. print upload statistics.
    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')
    (id, from, to,...)
    Keyword Arguments:
    re_match_object -- re object
    """
    """upload edge one by one
    Parameters:
    ls_edges -- list of edge tuples
    """
    len_edges = len(ls_edges)
    uploaded_number = 0
    batch_counter = 0
    for edge_counter in tqdm(range(0, len_edges, batch_size)):

        res = None
        error = None
        re_upload_error = None
        retry = 0
        nodes_fail_retry = 0
        graph_upload_request = graphUpload_pb2.GraphUploadRequest()
        while res is None:
            try:
                graph_upload_request = graphUpload_pb2.GraphUploadRequest()
                for e in ls_edges[batch_counter:batch_counter + batch_size]:
                    node_from = e[0]
                    node_to = e[1]
                    edge_type = e[2]

                    # page edge
                    if edge_type == 0:
                        edge = graph_upload_request.graph.edges.add()
                        edge.props.type = "HasElement"
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                            node_to)
                    # categories edge
                    else:
                        if node_from in IGNORE_CATEGORIES:
                            continue
                        edge = graph_upload_request.graph.edges.add()
                        edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_from)
                        edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                            node_to)
                        edge.props.type = "HasSubset"
                graph_upload_request.uploadTag = "uploadWikiEdge"
                graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                    'UPDATE')
                res = gs_call.upload_graph(graph_upload_request)

            except HTTPError as e:
                if e.code in RETRIABLE_STATUS_CODES:
                    error = 'A retriable HTTP error %d occurred:\n%s' % (
                        e.code, e.reason)
                else:
                    raise
            except RETRIABLE_EXCEPTIONS as e:
                error = 'A retriable error occurred: %s' % e
            except GRAPH_EXCEPTIONS as e:
                ylog.debug('A graph error occurred: %s' % e)
                break
            if error is not None:
                print(error)
                retry += 1
                res = None
                error = None
                if retry > MAX_RETRIES:
                    ylog.debug("no longer attempting to retry.")
                    break
                # exponential backoff with random jitter
                max_sleep = 2**retry
                sleep_seconds = random.random() * max_sleep
                print('Sleeping %f seconds and then retrying...' %
                      sleep_seconds)
                time.sleep(sleep_seconds)
        try:
            if res.edgeUpdateResultStatistics:
                ylog.debug(res.edgeUpdateResultStatistics)
                number = res.edgeUpdateResultStatistics.numOfCreations + \
                    res.edgeUpdateResultStatistics.numOfUpdates + \
                    res.edgeUpdateResultStatistics.numOfSkips
                uploaded_number += number
            if res.failedEdges:
                for err in res.failedEdges:
                    print(err)
                    print("start node: %s" %
                          err.edge.startNodeID.primaryKeyInDomain)
                    print("end node: %s" %
                          err.edge.endNodeID.primaryKeyInDomain)
        except AttributeError:
            # res is None when the batch was skipped or the upload gave up
            pass
        batch_counter += batch_size

    return uploaded_number
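A minimal usage sketch for the function above, assuming the same module-level objects as the original script (graphUpload_pb2, gs_call, IGNORE_CATEGORIES, MAX_RETRIES, the RETRIABLE_* constants and ylog). The edge tuples and batch size below are made-up illustrations, not data from the original project.

# hypothetical edge tuples in the (node_from, node_to, edge_type) shape used above;
# edge_type 0 is a category -> page "HasElement" edge, anything else a
# category -> category "HasSubset" edge.
sample_edges = [
    ("数学", "线性代数", 0),  # category -> page
    ("数学", "代数", 1),      # category -> sub-category
]
total = upload_edge_from_graph(sample_edges, batch_size=100)
print("uploaded edges: %s" % total)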
Example #21
0
from tqdm import tqdm
import time
import json
import logging
import networkx as nx

# ylog is the project's logging helper used throughout these snippets;
# its import is assumed to happen earlier in the original script.
ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("cycles")
graph = nx.read_gexf('whole_edges.no_loops.gexf')
ls_nodes = list(graph.nodes)
counter = 0
total_nodes_num = 287966
rm_counter = 0
try:
    while True:
        ylog.debug('rm cycles loops number %s' % counter)

        for node in tqdm(ls_nodes):
            removed_counter = 0
            ylog.debug('rm cycles of node %s' % node)

            while True:
                try:
                    ls_loop = nx.find_cycle(graph, node)
                    # remove direct edge:
                    ylog.debug(ls_loop)
                    if len(ls_loop) == 2:
                        if ls_loop[0][0] == ls_loop[1][1] and ls_loop[0][
                                1] == ls_loop[1][0]:
                            graph.remove_edge(ls_loop[0][0], ls_loop[0][1])
                    # remove big loop:
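The listing above is cut off after the two-edge case. A self-contained sketch of the same idea on a toy graph (the graph data is my own illustration, not from the original script):

import networkx as nx

g = nx.DiGraph()
g.add_edges_from([("A", "B"), ("B", "A"), ("B", "C")])  # A <-> B is a 2-edge cycle

try:
    loop = nx.find_cycle(g, "A")
    # a reciprocal pair such as [("A", "B"), ("B", "A")] is removed directly
    if len(loop) == 2 and loop[0][0] == loop[1][1] and loop[0][1] == loop[1][0]:
        g.remove_edge(loop[0][0], loop[0][1])
except nx.NetworkXNoCycle:
    pass

print(list(g.edges()))  # [('B', 'A'), ('B', 'C')]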
Example #22
0
def batch_upload(re, file_path, batch_size, func, start, end):
    """Batch upload categories or edges.
    1. read the sql dump file line by line.
    2. extract the target strings with the regular expression.
    3. put the matches into a dictionary.
    4. call the upload function with that dictionary as input.
    Keyword Arguments:
    re         -- compiled regular expression used to extract records
    file_path  -- path of the sql dump file
    batch_size -- number of matches passed to func per call
    func       -- upload function
    start      -- first line to process
    end        -- last line to process

    A stripped-down sketch of the inner matching loop follows this function.
    """
    uploaded_number = 0
    try:
        # the dump may contain bytes that are not valid utf-8:
        # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe5 in position 7629: invalid continuation byte
        with open(file_path, 'rb') as f:
            for i, line in enumerate(tqdm(f)):
                line_start_position = 0
                line_end_position = len(line)
                # process the whole line in a while loop until it's done
                while True:
                    if i < start:
                        break
                    try:
                        test_string = line[line_start_position:].decode(
                            'utf-8')
                        line_size = len(re.findall(test_string))

                    except UnicodeDecodeError as e:
                        line_end_position = e.start
                        ylog.debug('start at %s' % line_end_position)
                    finally:
                        string = line[line_start_position:
                                      line_end_position].decode('utf-8')
                        try:
                            last_span = re.search(string).span()[0]
                        except AttributeError:
                            break
                        line_size = len(re.findall(string))
                        for _ in range(0, line_size, batch_size):
                            # pause if a file named 'pause' exists in the current dir
                            re_batch = {}
                            for j in range(batch_size):
                                re_batch[j] = re.search(string, last_span)
                                if re_batch[j] is not None:
                                    last_span = re_batch[j].span()[1]
                            uploaded_count = func(re_batch)
                            uploaded_number += uploaded_count
                        line_end_position = len(line)
                        line_start_position = line_end_position + 10
                else:
                    break
    except KeyboardInterrupt:
        print("uploaded number: %s" % uploaded_number)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
    return uploaded_number
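The heart of the function is the inner loop that walks a decoded line with re.search, resuming from the end of the previous match and handing func a dict of at most batch_size matches. A stripped-down, self-contained sketch of just that idea, using a toy pattern and line of my own instead of the wiki sql dump:

import re

pattern = re.compile(r"\((\d+),'([^']+)'\)")
line = "INSERT INTO t VALUES (1,'foo'),(2,'bar'),(3,'baz');"
batch_size = 2

last_span = pattern.search(line).span()[0]
for _ in range(0, len(pattern.findall(line)), batch_size):
    batch = {}
    for j in range(batch_size):
        batch[j] = pattern.search(line, last_span)
        if batch[j] is not None:
            last_span = batch[j].span()[1]
    # a real run would call the upload function here, e.g. func(batch)
    print({k: m.group(0) for k, m in batch.items() if m is not None})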
Example #23
0
                         query=quote_plus(query),
                         language=language,
                         start=start)
        requests.packages.urllib3.disable_warnings(
            requests.packages.urllib3.exceptions.InsecureRequestWarning)
        r = requests.get(url=url,
                         proxies=proxies,
                         headers=headers,
                         allow_redirects=False,
                         verify=False,
                         timeout=30)
        time.sleep(5)
    except requests.exceptions.SSLError as e:
        print(e)
        # LOGGER.info(url)
        ylog.debug(domain)
        time.sleep(5)
        continue
LOGGER.info(url)
content = r.content
charset = cchardet.detect(content)
text = content.decode(charset['encoding'])
bsObj = BeautifulSoup(text, "lxml")

# result counts
brief_counts = bsObj.find_all('div', id='gs_ab_md')[0].text
print(brief_counts)
text1 = brief_counts.replace(',', '')
pattern = re.compile(r'\d+')
result_count = re.findall(pattern, text1)[0]
print(result_count)
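The count extraction at the end only depends on the gs_ab_md div, so it can be exercised offline against a static snippet of markup (the HTML below is a made-up stand-in for a real results page):

import re
from bs4 import BeautifulSoup

html = '<div id="gs_ab_md">About 1,230 results (0.05 sec)</div>'
bsObj = BeautifulSoup(html, "lxml")

brief_counts = bsObj.find_all('div', id='gs_ab_md')[0].text
text1 = brief_counts.replace(',', '')
result_count = re.findall(re.compile(r'\d+'), text1)[0]
print(result_count)  # 1230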
Example #24
0
def upload_single_edge(e):
    res = None
    error = None
    retry = 0
    while res is None:
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            node_from = e[0]
            node_to = e[1]
            edge_type = e[2]

            if edge_type == 0:
                edge = graph_upload_request.graph.edges.add()
                edge.props.type = "HasElement"
                edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_from)
                edge.endNodeID.url = "https://zh.wikipedia.org/wiki/" + quote_plus(
                    node_to)
            # categories edge
            else:
                if node_from in IGNORE_CATEGORIES:
                    break
                edge = graph_upload_request.graph.edges.add()
                edge.startNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_from)
                edge.endNodeID.url = "https://zh.wikipedia.org/wiki/Category:" + quote_plus(
                    node_to)
                edge.props.type = "HasSubset"
            graph_upload_request.uploadTag = "uploadWikiEdge"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
            print(res)
        # NB: do not bind the exceptions to the name "e" -- that would shadow
        # the edge tuple argument and break the retry loop.
        except HTTPError as err:
            if err.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (err.code,
                                                                     err.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as err:
            error = 'A retriable error occurred: %s' % err
        except GRAPH_EXCEPTIONS:
            break

        if error is not None:
            print(error)
            retry += 1
            res = None
            error = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                exit("no longer attempting to retry.")
            # exponential backoff with random jitter
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    try:
        if res.edgeUpdateResultStatistics:
            ylog.debug(res.edgeUpdateResultStatistics)
            uploaded_number = res.edgeUpdateResultStatistics.numOfCreations + \
                res.edgeUpdateResultStatistics.numOfUpdates + \
                res.edgeUpdateResultStatistics.numOfSkips
            ylog.debug(e)
        # if res.failedEdges:
        #     for err in res.failedEdges:
        #         print(err)
        #         print(
        #             "start node: %s" % err.edge.startNodeID.primaryKeyInDomain)
        #         print("end node: %s" % err.edge.endNodeID.primaryKeyInDomain)
    except AttributeError:
        # res is None when the edge was skipped or the upload gave up
        pass
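Both upload helpers repeat the same retry discipline: drop the response, double the sleep ceiling on every failure, and sleep a random fraction of it (exponential backoff with jitter). Isolated as a generic sketch, with a hypothetical helper name and a catch-all exception standing in for the specific retriable errors of the original module:

import random
import time


def call_with_backoff(fn, max_retries=10):
    """Call fn() until it succeeds, backing off 0..2**retry seconds between tries."""
    retry = 0
    while True:
        try:
            return fn()
        except Exception as err:  # the originals only retry specific error classes
            retry += 1
            if retry > max_retries:
                raise
            sleep_seconds = random.random() * 2**retry
            print('Sleeping %f seconds and then retrying... (%s)' % (sleep_seconds, err))
            time.sleep(sleep_seconds)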
Example #25
0
File: main.py  Project: leolle/simulate
    logger.setLevel(logging.DEBUG)

risk_model_path = '/home/weiwu/share/risk_model/'

# keep from double loading
stock_return = gftIO.zload(os.path.join(risk_model_path, 'stock_return.pkl'))
factors = gftIO.zload(os.path.join(risk_model_path, 'factors.pkl'))
market_capital = gftIO.zload(
    os.path.join(risk_model_path, 'market_capital.pkl'))
corr_half_life = gftIO.zload(
    os.path.join(risk_model_path, 'corr_half_life.pkl'))
var_half_life = gftIO.zload(os.path.join(risk_model_path, 'var_half_life.pkl'))

model = risk_model(stock_return, factors, market_capital, corr_half_life,
                   var_half_life)
ylog.debug('parse data')

# get all factor names
ls_fexponame = factors['osets'].asColumnTab()['O0'].apply(
    gftIO.gidInt2Str).tolist()
ind_factor_name = factors[ls_fexponame[0]].asColumnTab()['O0'].apply(
    gftIO.gidInt2Str).tolist()
style_factor_name = factors[ls_fexponame[1]].asColumnTab()['O0'].apply(
    gftIO.gidInt2Str).tolist()

allfactor = ind_factor_name + style_factor_name

# stock return preprocess
if isinstance(stock_return, gftIO.GftTable):
    # df_w_ret = stock_return.asMatrix().T.dropna(how='all', axis=1)
    df_stock_return = stock_return.asMatrix().dropna(axis=1, how='all')
Example #26
0
    def search(self, query, limit=10, download=False):
        """
        Performs a query on scholar.google.com, and returns a dictionary
        of results in the form {'papers': ...}. Unfortunately, as of now,
        captchas can potentially prevent searches after a certain limit.
        """
        start = 0
        results = {'papers': []}

        while True:
            try:
                self.sess.headers = {'user-agent': self.get_random_user_agent()}
                res = self.sess.get(
                    SCHOLARS_BASE_URL,
                    allow_redirects=True,
                    params={
                        'q': query,
                        'hl': 'en',
                        'start': start,
                        'as_sdt': '0,5'
                    })
                ylog.debug(res.url)
            except requests.exceptions.RequestException as e:
                results[
                    'err'] = 'Failed to complete search with query %s (connection error)' % query
                return results

            s = self._get_soup(res.content)
            papers = s.find_all('div', class_="gs_r")

            if not papers:
                # res.content is bytes, so compare against a bytes literal
                if b'CaptchaRedirect' in res.content:
                    results[
                        'err'] = 'Failed to complete search with query %s (captcha)' % query
                return results

            for paper in papers:
                if not paper.find('table'):
                    source = None
                    pdf = paper.find('div', class_='gs_ggs gs_fl')
                    link = paper.find('h3', class_='gs_rt')
                    # find link type,
                    try:
                        url_type = paper.find(
                            'span', class_='gs_ctg2').get_text()[1:-1]
                    except AttributeError:
                        # the result carries no type label
                        url_type = None

                    if pdf:
                        source = pdf.find('a')['href']
                    elif link.find('a'):
                        source = link.find('a')['href']
                    else:
                        continue
                    article_link = link.find('a')['href']
                    results['papers'].append({
                        'name':
                        re.sub(self.re_bracket, "",
                               link.text.replace("\xa0…", "")),
                        'url':
                        source,
                        'article_link':
                        article_link,
                        'type':
                        url_type
                    })

                    if len(results['papers']) >= limit:
                        return results

            start += 10
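A minimal usage sketch for the method above. The class that owns search() is not shown in this snippet, so ScholarSearcher below is a placeholder name, not the real one:

searcher = ScholarSearcher()
result = searcher.search('graph embedding survey', limit=5)
if 'err' in result:
    print(result['err'])
else:
    for paper in result['papers']:
        print(paper['name'], paper['article_link'])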
Example #27
0
    uploaded_number = batch_upload(
        wiki_category_re,
        category_path,
        batch_size,
        upload_cat_node,
        start=0,
        end=6080000000)
    print("uploaded number: %s" % (uploaded_number))

    # upload edge

    # ylog.debug('reading link sql file')

    # with open("graph_no_loop.pkl", 'rb') as fp:
    #     itemlist = pickle.load(fp)
    ylog.debug("uploading wiki categorie page link")
    category_link_path = './data/zhwiki-latest-categorylinks.zhs.sql'
    wiki_category_link_re = re.compile(
        "\(([0-9]+),('[^,]+'),('[^']+'),('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'),('[^']*'),('[^,]+'),('[^,]+')\)"
    )

    # for i in tqdm(itemlist[5308253:]):
    #     upload_single_edge(i)
    # uploaded_number = upload_edge_from_graph(itemlist[int(sys.argv[2]):],
    #                                          int(sys.argv[1]))
    uploaded_number = batch_upload(
        wiki_category_link_re,
        category_link_path,
        batch_size,
        upload_edge,
        start=0,