Example #1
def task_validator(project, sstr, endwith, endtime):  # should the task end early?
    if stats.c_skipped_pmid >= config.pmid_max_c_skip and endwith:  # early finish enabled and the skip threshold reached
        msg.msg("crawl pmid", project + sstr, "repeat end",
                "succ", "notice", msg.display, msg.log)
        return False  # False means "do not continue"
    if endtime < ut.time_str("full") and endwith:  # early finish enabled and the time limit reached
        msg.msg("crawl pmid", project + sstr, "time end",
                "succ", "notice", msg.display, msg.log)
        return False
    else:
        return True  # True means "keep going"
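A minimal sketch of how the validator might gate a crawl loop; crawl_one_round and the setup values are hypothetical, while ut.time_str("full", 30) mirrors the call in run_task further down:

endwith = True                     # allow early finishing
endtime = ut.time_str("full", 30)  # hypothetical 30-minute window
while task_validator("test", "lactobacillus", endwith, endtime):
    crawl_one_round("test", "lactobacillus")  # hypothetical helper fetching one batch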
Example #2
def journal_name_wash(journal_name_raw):  # clean the raw name (brackets, punctuation, explanations, annotations)
    re_bracket = re.compile(r"[\[\(](.*?)[\]\)]")  # strip bracketed explanations
    re_explanation = re.compile(" ??[:=].*")  # strip explanations after ":" or "="
    journal_name = journal_name_raw.replace('&amp;',
                                            "&").replace(',', '').replace(
                                                ".", '')  # some names contain "&"
    journal_name = re_bracket.sub('', journal_name)
    journal_name = re_explanation.sub('', journal_name)
    journal_name = journal_name.upper()  # washed names are all upper-case
    msg.msg("journal name", journal_name_raw, "washed", journal_name, "debug",
            msg.display)
    return journal_name
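A standalone sketch of the same washing rules, handy for checking them without the msg module; the sample name is made up:

import re

re_bracket = re.compile(r"[\[\(](.*?)[\]\)]")
re_explanation = re.compile(" ??[:=].*")

name = "Nature (London): a weekly journal"  # made-up input
name = name.replace('&amp;', '&').replace(',', '').replace('.', '')
name = re_bracket.sub('', name)      # -> "Nature : a weekly journal"
name = re_explanation.sub('', name)  # -> "Nature"
print(name.upper())                  # NATURE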
Example #3
def parse_url(project, sstr="key_words"):  # turn keywords into URL form; temporary, more to be added later
    sstr_type = mh.read_sstr_type(project, sstr)
    if sstr_type == "key_words" or sstr_type == "key_word":
        if "," in sstr:
            sstr = sstr.replace(", ", ",").replace(" ,", ",")  # 防止有空格
        elif " " in sstr:
            sstr = sstr.replace(" ", ",")
        sstr = sstr.replace(",", "%2C")  # 换成链接形式
    if sstr_type == "expression":
        pass
    url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + sstr  # 最初的查询网址
    msg.msg("url", sstr, "parsed", url, "debug", msg.display)
    return url
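A standalone sketch of the keyword-to-URL step, assuming the mh lookup returned "key_words"; the search string is made up. Note that only commas are encoded and embedded spaces are left alone, mirroring the function above:

sstr = "gut microbiome, lactobacillus"  # made-up keywords
sstr = sstr.replace(", ", ",").replace(" ,", ",")
sstr = sstr.replace(",", "%2C")
print("https://www.ncbi.nlm.nih.gov/pubmed/?term=" + sstr)
# https://www.ncbi.nlm.nih.gov/pubmed/?term=gut microbiome%2Clactobacillus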
Example #4
def journal_detail(journal_name):
    record = mh.read_journal_detail(journal_name)  # try the raw name first
    if record:
        msg.msg("journal record", journal_name, "local retrieved", "succ",
                "debug", msg.display)
        return record
    else:
        wjournal_name = journal_name_wash(journal_name)  # try the washed name against official names
        record = mh.read_ojournal_detail(wjournal_name)
        if record:
            msg.msg("journal record", journal_name, "local retrieved", "succ",
                    "debug", msg.display)
            return record
        else:
            ojournal_name = get_official_name(wjournal_name)  # fetch the official name online, then try it
            record = mh.read_ojournal_detail(ojournal_name)
            if record:
                msg.msg("journal record", journal_name, "web retrieved",
                        "succ", "debug", msg.display)
                return record
            else:
                journal_info = get_journal_info(ojournal_name)  # look the official name up on the web
                journal_if = journal_info[0]
                journal_zone = journal_info[1]
                mh.add_journal(journal_name, ojournal_name, journal_if,
                               journal_zone)  # store the new journal
                msg.msg("journal record", journal_name, "web retrieved",
                        "succ", "debug", msg.display)
                data = journal_name, ojournal_name, journal_if, journal_zone
                return data
Example #5
def run_task(project, sstr):  # mrmins sets how long the run may last
    record_number, mrmins, endwith = get_task_config(project, sstr)
    endtime = ut.time_str("full", mrmins)
    msg.msg("crawl pmid", project + sstr, "started", "succ", "important", msg.display, msg.log, msg.stat)
    pc.run_pmid_crawler(project, sstr, record_number, endwith, endtime)
    msg.msg("crawl pmid", project + sstr, "finished", "succ", "important", msg.display, msg.log, msg.stat)

    msg.msg("crawl detail", project + sstr, "started", "succ", "important", msg.display, msg.log, msg.stat)
    dc.run_detail_crawler(project, sstr, record_number)
    msg.msg("crawl detail", project + sstr, "finished", "succ", "important", msg.display, msg.log, msg.stat)
Example #6
def get_data(filename, drop_data=False):
    '''Get pandas.DataFrame from .csv or .csv.zip'''
    msg('Reading datafile : {}'.format(filename))
    if not os.path.exists(filename):
        raise IOError('No such file: {}'.format(filename))
    if filename.endswith('.zip'):
        z = zipfile.ZipFile(filename)
        filename = z.open(filename.replace('.zip', ''))
    data = pandas.read_csv(
        filename,
        parse_dates=['Dates'], infer_datetime_format=True,
        comment='#',
    )
    #data.Time = data.Dates.map(lambda x: x.time())
    if drop_data:
        data = data[(data.X < -121) & (data.Y < 40)]
        data = data.dropna()
        data = data.reset_index(drop=True)
    msg('Read datafile : {}'.format(filename), 2)
    return data
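A hypothetical call, assuming a local crime.csv.zip whose CSV contains Dates, X, and Y columns (the filename is made up):

data = get_data('crime.csv.zip', drop_data=True)  # hypothetical file
print(data.Dates.dtype)  # datetime64[ns], thanks to parse_dates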
Example #7
        def draw(self):

            if self.perfect:
                # Drawn twice on purpose: each call recomputes wiggle(),
                # so the two copies presumably land at slightly different
                # offsets, giving the "perfect" message a jittered look.
                msg(self.scr,
                    self.message,
                    wiggle(self.pos, 5 * self.size),
                    self.color,
                    self.fontsize * wiggle(self.size, 1, -2) * 1.2,
                    centered=True,
                    **self.kwargs)

                msg(self.scr,
                    self.message,
                    wiggle(self.pos, 5 * self.size),
                    self.color,
                    self.fontsize * wiggle(self.size, 1, -2) * 1.2,
                    centered=True,
                    **self.kwargs)
            else:
                msg(self.scr,
                    self.message,
                    self.pos,
                    self.color,
                    self.fontsize * self.size,
                    centered=True,
                    **self.kwargs)
Example #8
def get_official_name(journal_name_raw,
                      proxy=None):  # look up the journal's full name; fuzzy search, return the best match only
    url = "http://www.letpub.com.cn/journalappAjax.php?querytype=autojournal&term=" + \
        journal_name_raw.replace("&", "%26").replace(" ", "+")
    tries = config.request_dp_tries
    while tries > 0:
        try:
            opener = requests.Session()
            doc = opener.get(url, timeout=20, headers=agents.get_header()).text
            results = doc.split('},{')  # split into candidates; only the best match is used
            journal_name_start = results[0].find("label") + 8
            journal_name_end = results[0].find("\",\"", journal_name_start)
            journal_name = results[0][journal_name_start:journal_name_end]
            journal_name = journal_name.upper()  # retrieved names are upper-case too
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    journal_name, "debug", msg.display)
            return journal_name
        except Exception as e:
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    "retried", "debug", msg.display)
            msg.msg("journal name", journal_name_raw, "web retrieved", str(e),
                    "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
Example #9
def adjust_record_number(project, sstr, record_number):  # determine the correct record count
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # headers can still be randomized
            max_record_number_start = content.find(
                "<h3 class=\"result_count left\">Items:") + 37  # locate the start of the count text
            max_record_number_end = content.find(
                '</h3>', max_record_number_start)
            record_number_str = content[max_record_number_start:max_record_number_end]
            max_record_number = int(record_number_str.split(" ")[-1])
            if max_record_number < record_number:
                record_number = max_record_number
                msg.msg("record number", "", "changed", str(
                    record_number), "notice", msg.log, msg.display)
            return record_number
        except Exception as e:
            msg.msg("record number", "", "read", str(e), "error", msg.log)
            msg.msg("record number", "", "read",
                    "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
Example #10
def crawl_direct(project, sstr):  # crawl the summary page directly; first page only, but fast (no phantomjs)
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # headers can still be randomized
            msg.msg("sum page", "1", "loaded", "proc", "info", msg.display)
            pmid_list = extract_new_pmid(content)  # extract pmids, then drop known ones
            if pmid_list:
                mh.add_new_pmid_many(
                    project, sstr, ut.time_str("full"), "pm", pmid_list)
            msg.msg("sum page", "1", "loaded", "succ",
                    "info", msg.display, msg.log)
            break
        except Exception as e:
            msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
            msg.msg("sum page", "1", "loaded",
                    "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
Example #11
def extract_new_pmid(content):  # generic way to extract pmids from text
    pmid_set = []
    pmid_raw = re.findall(r"<dd>\d{8}</dd>", content)
    for pmid in pmid_raw:
        pmid = str(pmid[4:-5])  # strip the <dd></dd> tags
        msg.msg("pmid", str(pmid), "retrieved", "proc",
                "info", msg.log, msg.display, msg.stat)
        if pmid not in existed_pmid_set:
            pmid_set.append(pmid)
            msg.msg("pmid", str(pmid), "retrieved", "succ",
                    "info", msg.log, msg.display, msg.stat)
        else:
            msg.msg("pmid", str(pmid), "skipped", "skip",
                    "info", msg.log, msg.display, msg.stat)
    return pmid_set
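A standalone check of the pmid pattern on a made-up snippet of PubMed markup:

import re

content = "<dd>29133234</dd><dd>29133235</dd>"  # made-up markup
print([m[4:-5] for m in re.findall(r"<dd>\d{8}</dd>", content)])
# ['29133234', '29133235']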
Example #12
def get_journal_info(ojournal_name, proxy=None):  # look up a journal's impact factor and zone; needs an exact name
    url = "http://www.letpub.com.cn/index.php?page=journalapp&view=search"
    search_str = {
        "searchname": "",
        "searchissn": "",
        "searchfield": "",
        "searchimpactlow": "",
        "searchimpacthigh": "",
        "searchscitype": "",
        "view": "search",
        "searchcategory1": "",
        "searchcategory2": "",
        "searchjcrkind": "",
        "searchopenaccess": "",
        "searchsort": "relevance"
    }
    search_str["searchname"] = ojournal_name
    tries = config.request_dp_tries
    while tries > 0:
        try:
            opener = requests.Session()
            doc = opener.post(url, timeout=20, data=search_str).text
            selector = etree.HTML(doc.encode("utf-8"))
            journal_detail_element = selector.xpath(
                "//td[@style=\"border:1px #DDD solid; border-collapse:collapse; text-align:left; padding:8px 8px 8px 8px;\"]"
            )
            if len(journal_detail_element):
                impact_factor = journal_detail_element[2].xpath('string(.)')
                publication_zone = journal_detail_element[3].xpath(
                    'string(.)')[0]
            else:
                impact_factor = ""
                publication_zone = ""
            msg.msg("journal info", ojournal_name, "web retrieved", "succ",
                    "debug", msg.display)
            return impact_factor, publication_zone
        except Exception as e:
            msg.msg("journal info", ojournal_name, "web retrieved", "retried",
                    "debug", msg.display)
            msg.msg("journal info", ojournal_name, "web retrieved", str(e),
                    "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
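A hypothetical call; per the comment above, the name must already be exact, e.g. as returned by get_official_name:

impact_factor, zone = get_journal_info("NATURE")  # hypothetical exact journal name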
Example #13
File: player.py  Project: xhalo32/advpy
    def draw(self):

        p = self.player
        d = p.pos[0] + 120 * p.scoreboardside
        ratio = 0

        try:
            ratio = float(p.hits) / p.arrows_gone_by
            per = format(ratio * 100, ".2f") + "%"  # hit percentage
        except ZeroDivisionError:  # no arrows yet
            per = "%"

        message.msg(self.main.scr,
                    str(p.hits) + "/" + str(p.arrows_gone_by), [d, 10],
                    (0, 0, 255),
                    weight=p.scoreboardside)
        if p.arrows_gone_by:
            message.msg(self.main.scr,
                        per, [d, 30], (225 * (1 - ratio), 225 * ratio, 0),
                        size=20,
                        weight=p.scoreboardside)

        message.msg(self.main.scr,
                    p.points, [d, 100], (0, 0, 200),
                    weight=p.scoreboardside)
        message.msg(self.main.scr,
                    p.combo, [d, 130], (0, 200, 0),
                    weight=p.scoreboardside)
        message.msg(self.main.scr,
                    p.misclicks, [d, 160], (200, 0, 0),
                    weight=p.scoreboardside)
        message.msg(self.main.scr,
                    p.misses, [d, 190], (200, 200, 0),
                    weight=p.scoreboardside)

        message.msg(self.main.scr,
                    p.totalarrows, [p.pos[0], 20], (100, 200, 230),
                    weight=p.scoreboardside)
Example #14
            if max_record_number < record_number:
                record_number = max_record_number
                msg.msg("record number", "", "changed", str(
                    record_number), "notice", msg.log, msg.display)
            return record_number
        except Exception as e:
            msg.msg("record number", "", "read", str(e), "error", msg.log)
            msg.msg("record number", "", "read",
                    "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
    else:  # while-else: runs only if all retries were used up
        msg.msg("record number", "", "read", "fail",
                "error", msg.display, msg.log)


def extract_new_pmid(content):  # generic way to extract pmids from text
    pmid_set = []
    pmid_raw = re.findall(r"<dd>\d{8}</dd>", content)
    for pmid in pmid_raw:
        pmid = str(pmid[4:-5])  # strip the <dd></dd> tags
        msg.msg("pmid", str(pmid), "retrieved", "proc",
                "info", msg.log, msg.display, msg.stat)
        if pmid not in existed_pmid_set:
            pmid_set.append(pmid)
            msg.msg("pmid", str(pmid), "retrieved", "succ",
                    "info", msg.log, msg.display, msg.stat)
        else:
            msg.msg("pmid", str(pmid), "skipped", "skip",
                    "info", msg.log, msg.display, msg.stat)
    return pmid_set
Example #15
            journal_name_end = results[0].find("\",\"", journal_name_start)
            journal_name = results[0][journal_name_start:journal_name_end]
            journal_name = journal_name.upper()  # retrieved names are upper-case too
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    journal_name, "debug", msg.display)
            return journal_name
        except Exception as e:
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    "retried", "debug", msg.display)
            msg.msg("journal name", journal_name_raw, "web retrieved", str(e),
                    "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
    else:  # while-else: runs only if all retries were used up
        msg.msg("journal name", journal_name_raw, "web retrieved", "fail",
                "error", msg.log, msg.display)
        return ""
        return ""


def get_journal_info(ojournal_name, proxy=None):  # look up a journal's impact factor and zone; needs an exact name
    url = "http://www.letpub.com.cn/index.php?page=journalapp&view=search"
    search_str = {
        "searchname": "",
        "searchissn": "",
        "searchfield": "",
        "searchimpactlow": "",
        "searchimpacthigh": "",
        "searchscitype": "",
        "view": "search",
        "searchcategory1": "",
        "searchcategory2": "",
Example #16
#!/usr/bin/python3
from message import sendmessage, sendmessageicon, sendmessagetitle, msg

sendmessage("hello you")
sendmessageicon("hello you","face-wink")
sendmessagetitle("message","title")
msg("message","title","face-angry")


Example #17
def crawl_detail(pmid, proxy=None):  # crawl the detail page for one pmid
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
    tries = config.request_dp_tries  # retry count from config
    msg.msg("record", pmid, "retrieved", "proc", "info", msg.display, msg.stat)
    while tries > 0:
        try:
            authors = []
            institues = []
            countries = []
            flinks = []
            title = journal = ojournal = journal_if = journal_zone = ""
            issue = abstract = ""  # defaults in case a field is missing from the page
            opener = requests.Session()  # a new session per attempt
            content = opener.get(
                link,
                timeout=config.request_time_out,
                headers=agents.get_header()).text  # note: the user agent is randomized each time
            selector = etree.HTML(content.encode("utf-8"))
            title_element = selector.xpath(
                "//div[@class = \"rprt abstract\"]//h1")
            if len(title_element):
                title = title_element[0].xpath('string(.)')
            author_element = selector.xpath("//div[@class = \"auths\"]//a")
            if len(author_element):
                for author in author_element:
                    authors.append(author.xpath('string(.)'))
            journal_element = selector.xpath("//a[@alsec=\"jour\"]/@title")
            if len(journal_element):
                journal = journal_element[0]
                if journal:
                    journal_detail = jn.journal_detail(journal)
                    ojournal = journal_detail[0]
                    journal_if = journal_detail[1]
                    journal_zone = journal_detail[2]
            abstract_element = selector.xpath(
                "//*[@id=\"maincontent\"]/div/div[5]/div/div[4]")
            if len(abstract_element):
                abstract = abstract_element[0].xpath('string(.)')[8:]
            key_words_element = selector.xpath(
                "//*[@id=\"maincontent\"]/div/div[5]/div/div[5]/p")
            if len(key_words_element):
                key_words = key_words_element[0].xpath('string(.)').split("; ")
            else:
                key_words = []
            issue_element = selector.xpath("//div[@class = \"cit\"]")  # publication year
            if len(issue_element):
                issue_raw = issue_element[0].xpath('string(.)')
                issue_start = issue_raw.find(".")
                issue = issue_raw[issue_start + 2:issue_start + 6]
            institues_element = selector.xpath("//div[@class=\"afflist\"]//dd")
            if len(institues_element):
                for institue in institues_element:
                    institue = institue.xpath('string(.)')
                    institue = ut.regexp_replace(
                        institue, ut.re_email_pm)  # strip PubMed-style email sentences
                    institue = ut.regexp_replace(
                        institue, ut.re_email_general)  # strip any remaining email addresses
                    institue = institue.replace(" ,", ",")
                    institues.append(institue)
                    institue = institue.replace(", ", ",").replace(".", "")
                    institue_strs = institue.split(",")
                    institue_strs.reverse()  # the country name usually comes last
                    i = 0
                    while i < len(institue_strs):
                        if institue_strs[i] in dictionary.country_names.keys(
                        ):  # if this token is a known country
                            country_name = dictionary.country_names[
                                institue_strs[i]]  # direct lookup
                            if country_name not in countries:
                                countries.append(country_name)
                            break
                        else:
                            i += 1
            flink_element = selector.xpath(
                "//div[@class=\"icons portlet\"]//a/@href")
            if len(flink_element):
                for flink in flink_element:
                    flinks.append(flink)
            mh.add_new_content(pmid, title, authors, journal, ojournal,
                               journal_if, journal_zone, issue, abstract,
                               key_words, institues, countries, flinks)
            msg.msg("record", pmid, "retrieved", "succ", "info", msg.display,
                    msg.stat)
            break
        except Exception as e:
            msg.msg("record", pmid, "retrieved", "retried", "notice",
                    msg.display)
            msg.msg("record", pmid, "retrieved", str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)  # on failure, rest a bit before retrying
Example #18
                    flinks.append(flink)
            mh.add_new_content(pmid, title, authors, journal, ojournal,
                               journal_if, journal_zone, issue, abstract,
                               key_words, institues, countries, flinks)
            msg.msg("record", pmid, "retrieved", "succ", "info", msg.display,
                    msg.stat)
            break
        except Exception as e:
            msg.msg("record", pmid, "retrieved", "retried", "notice",
                    msg.display)
            msg.msg("record", pmid, "retrieved", str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)  # on failure, rest a bit before retrying

    else:  # while-else: runs only if all retries were used up
        msg.msg("record", pmid, "retrieved", "fail", "error", msg.display,
                msg.log)
        return 0


def run_detail_crawler(project, sstr, record_number):
    pmid_list = get_pmid_list(project, sstr, record_number)
    pool = Pool(config.detail_crawler_number)  # create the process pool
    pool.map(crawl_detail, pmid_list)
    pool.close()  # stop accepting new work
    pool.join()  # wait for all workers to finish


if __name__ == '__main__':
    run_detail_crawler("test", "lactobacillus", 1000)
Example #19
def save_png(browser):
    browser.save_screenshot(
        ut.cur_file_dir() + "/browser/" + ut.time_str("time") + ".png")
    msg.msg("screenshot", "", "saved", "succ", "debug", msg.display, msg.log)