Example #1
# Standard-library imports for this snippet; VK, Downloader and CONFIG_PATH
# come from the surrounding project and are not shown here.
from argparse import ArgumentParser
from json import load as json_load
from typing import Dict, List, Union


def main():
    arg_parser = ArgumentParser(
        prog='PhotoGet',
        description=
        'A program for getting someone\'s photos from social networks',
        epilog='© 2015, Dmitry Starov')

    arg_parser.add_argument(
        '-j',
        '--jsonPath',
        help='A path to the JSON file with your lover\'s info',
        required=True,
        dest='json_path')
    arg_parser.add_argument(
        '-s',
        '--savePath',
        help='A path where all downloaded photos will be stored',
        required=True,
        dest='save_path')
    arg_parser.add_argument('-l',
                            '--vkLogin',
                            help='Your VK login for authentication',
                            required=True,
                            dest='vk_login')
    arg_parser.add_argument('-p',
                            '--vkPass',
                            help='Your VK password for authentication',
                            required=True,
                            dest='vk_password')

    args = arg_parser.parse_args()

    with open(CONFIG_PATH, 'r') as config_file:
        config: Dict[str, int] = json_load(config_file)
    with open(args.json_path, 'r') as json_path:
        people: List[Dict[str,
                          Union[str,
                                List[str]]]] = json_load(json_path)['people']

    vk_links = VK(app_id=config['vk_app_id'],
                  scope=config['vk_scope'],
                  login=args.vk_login,
                  password=args.vk_password).get_links(people=people)

    not_downloaded = Downloader().download(links=vk_links,
                                           save_path=args.save_path)
    # Retry any failed downloads until every link has been fetched.
    while not_downloaded:
        not_downloaded = Downloader().download(links=not_downloaded,
                                               save_path=args.save_path)
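Example #1 declares four required options; the sketch below is not part of the original project (the script name, file paths and credentials are placeholders) and only shows how the flags map onto the parsed attribute names.

# Hypothetical usage sketch for Example #1's parser; an equivalent shell call
# would be:  python photoget.py -j people.json -s ./photos -l [email protected] -p secret
from argparse import ArgumentParser

parser = ArgumentParser(prog='PhotoGet')
parser.add_argument('-j', '--jsonPath', required=True, dest='json_path')
parser.add_argument('-s', '--savePath', required=True, dest='save_path')
parser.add_argument('-l', '--vkLogin', required=True, dest='vk_login')
parser.add_argument('-p', '--vkPass', required=True, dest='vk_password')

args = parser.parse_args(['-j', 'people.json', '-s', './photos',
                          '-l', '[email protected]', '-p', 'secret'])
print(args.json_path, args.save_path, args.vk_login)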
Example #2
 def get_papers(self):
     if not self.valid:
         return
     self.selector = etree.HTML(Downloader('http://aan.how/browse/author/publications/' + self._id)())
     papers_url_id = deep_select(self.selector, return_type="list",
                                 xpath="///tr[@class='gradeA']/td[2]/a/@href")
     self.author.papers_count = len(papers_url_id)
     self.author.publications = [to_num(x) for x in papers_url_id]
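The aan.how crawler snippets (Examples #2, #4, #6-#10) all follow the same pattern: Downloader(url) builds a fetcher for a single page, calling the instance returns the page HTML as a string, and the result is parsed with etree.HTML. The real util.downloader.Downloader is not shown on this page; a minimal sketch of such a callable downloader, assuming the requests library and ignoring the cache/delay arguments seen in Example #3, could look like:

# Sketch of a callable page downloader; the project's real Downloader may add
# retries, caching (RedisCache) and request throttling.
import requests


class SimpleDownloader:
    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def __call__(self):
        # Return the page body as text, or an empty string on failure, which
        # matches how the callers above treat a falsy self.content.
        try:
            response = requests.get(self.url, timeout=self.timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            return ''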
Example #3
 def __init__(self):
     if not os.path.exists('output'):
         os.mkdir('output')
     self.bs_books = {}
     self.stopped = False
     self.queue = []
     self.category_dict = {}
     self.downloader = Downloader(0, RedisCache())
Example #4
 def get_in_citation(self):
     if not self.valid:
         return
     self.selector = etree.HTML(
         Downloader('http://aan.how/browse/incoming_citations/' +
                    self._id)())
     in_citations = deep_select(self.selector,
                                return_type="list",
                                xpath='//a/@href')
     if in_citations:
         self.paper.in_citations = [to_num(x) for x in in_citations]
Example #5
	def __init__(self, downloader = None):
		mkdir(spring_dir)
		mkdir(content_dir)
		mkdir(package_dir)

		if not os.path.exists(pool_dir):
			os.mkdir(pool_dir)
			for i in range(0, 256):
				os.mkdir(os.path.join(pool_dir, '%02x' % i))

		self.__downloader = downloader or Downloader(os.path.join(content_dir, 'downloader.cfg'))
		self.__repositories = RepositorySource(content_dir, self.__downloader)
		self.__packages = PackageSource(content_dir, self.__repositories)
		self.__pinned_tags = PinnedTags()
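The 256 two-hex-digit subdirectories created above look like a content pool bucketed by the first byte of a hash; that layout is an assumption here, since the rest of the pool handling is not shown. Under that assumption, a helper mapping a name to its bucket might look like:

# Assumed helper, not part of the original code: place a pooled file under
# pool_dir/<first hash byte>/ using a SHA-256 digest of its name.
import hashlib
import os


def pool_path(pool_dir, name):
    digest = hashlib.sha256(name.encode('utf-8')).hexdigest()
    return os.path.join(pool_dir, digest[:2], digest)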
Example #6
    def get_partners(self):
        if not self.valid:
            return
        self.selector = etree.HTML(Downloader('http://aan.how/browse/author/collaborators/' + self._id)())
        name = deep_select(self.selector, return_type="list",
                           xpath="//tr[@class='gradeA']/td[1]/a/text()")
        self.author.partners_full_name = name
        # Number of co-authored papers
        num = deep_select(self.selector, return_type="list",
                          xpath="//tr[@class='gradeA']/td[2]/text()")
        for x in range(len(name)):
            papers_id = deep_select(self.selector, return_type="list",
                                    xpath="//tr[@class='gradeA'][" + str(x + 1) + "]/td[3]/a/text()")
            self.author.collaborators.append({"author": name[x], "num": num[x], "papers_id": papers_id})

        partners_id = deep_select(self.selector, return_type="list",
                                  xpath="//tr[@class='gradeA']/td[1]/a/@href")
        self.author.partners_id = [to_num(x) for x in partners_id]
Example #7
 def __init__(self, _id, content=None, **kwargs):
     self._id = str(_id)
     self.author = Author(_id=self._id)
     if not self.author.full_name:
         self.valid = True
     else:
         return
     if self.author.insert_flag:
         if content is None:
             self.content = Downloader(host + self._id)()
             if self.content:
                 self.valid = True
             else:
                 self.logger.error("Page content is empty, cannot parse\t_id:" + self._id)
                 self.valid = False
                 return
         else:
             self.valid = True
             self.content = content
         self.selector = etree.HTML(self.content)
Example #8
 def __init__(self, _id, content=None, **kwargs):
     self._id = str(_id)
     self.paper = Paper()
     page_data = col_paper.find_one({"url_id": self._id})
     if page_data:
         # Already stored in the database, so return immediately
         return
     if content is None:
         self.content = Downloader(host + self._id)()
         if self.content:
             self.valid = True
         else:
             logger.error("Page content is empty, cannot parse\turl_id:" + self._id)
             self.valid = False
             return
     else:
         self.valid = True
         self.content = content
     self.selector = etree.HTML(self.content)
     self.paper.url_id = self._id
Example #9
 def get_citing_sentence(self):
     if not self.valid:
         return
     self.selector = etree.HTML(
         Downloader('http://aan.how/browse/citing_sentences/' + self._id)())
     paper_id = deep_select(self.selector,
                            return_type="list",
                            xpath='//a/text()')
     sentence = deep_select(self.selector,
                            return_type="list",
                            xpath="//tr/td[4]/div/text()")
     line = deep_select(self.selector,
                        return_type="list",
                        xpath="//tr/td[3]/text()")
     if paper_id and sentence:
         for x in range(len(paper_id)):
             citing_sentences = {
                 "paper_id": paper_id[x],
                 "sentence": clean(sentence[x]),
                 "line": line[x]
             }
             self.paper.citing_sentences.append(citing_sentences)
Example #10
from util.downloader import Downloader
file_name = "../data_set/data.html"
url = 'http://aan.how/browse/paper/30000'
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(Downloader(url)())
print("写入完成")
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:     test_spider.py
   Author :        Carl
   Author_email:   [email protected]
   date:          test_spider.py
   Description :
-------------------------------------------------
#  If this run wrong, don't ask me , I don't know why;
#  If this run right, thank god, and I don't know why.
#  Maybe the answer, my friend, is blowing in the wind.                 
-------------------------------------------------
"""
__author__ = 'Carl'
from lxml import etree
from util.downloader import Downloader
from util.stringUtil import deep_select

# content = open("data.html", "r", encoding='utf-8').read()
content = Downloader(url='http://aan.how/browse/author/publications/3835')()
selector = etree.HTML(content)
# f = open("data.html", "w", encoding="utf-8")
# f.write(etree.tostring(selector).decode("utf-8"))
# f.close()
author = "//tr[@class='gradeA']/td[2]/a/@href"
print(len(deep_select(selector, return_type="list", xpath=author)))
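The test script only counts the publication hrefs it selects. The other snippets convert such hrefs into numeric ids with util.stringUtil.to_num, whose implementation is not shown here; a helper with the assumed behaviour (extract the trailing integer from an href such as '/browse/paper/30000') could be:

# Assumed behaviour of to_num: pull the trailing integer id out of an href.
import re


def to_num(href):
    match = re.search(r'(\d+)$', href.rstrip('/'))
    return int(match.group(1)) if match else None


print(to_num('http://aan.how/browse/paper/30000'))  # 30000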