Example #1
def get_owner_repo_map():
    """
    We use repo owner and repo name to distinguish
    one repo from another.
    """
    repo_info_list = get_repo_info(to_dict=False)
    ret_map = dict()
    for ri in repo_info_list:
        key = (ri.repo_owner, ri.repo_name)
        ret_map.setdefault(key, []).append(ri.paper_id)
    paper_data = get_papers_from_db()
    paper_map = dict()
    for pd in paper_data:
        paper_map[pd.id] = (pd.repo_owner, pd.repo_name)

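    # sanity check: every paper id in the map should point back to the same (owner, name) pair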
    for (o, n) in ret_map.keys():
        pids = ret_map[(o, n)]
        for pid in pids:
            if paper_map[pid] != (o, n):
                print(o, n)
                print(paper_map[pid])
    return ret_map
Example #2
def crawl_repo_stargazer():
    papers = get_papers_from_db()
    for p in papers:
        s = StargazerCrawler(p)
        while not s.end_crawl:
            result_json = s.get_next_page()
            s.result_to_disk(result_json)
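            # pause briefly between pages to avoid hitting the API rate limit (requires 'import time')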
            time.sleep(0.5)
Example #3
def combine_with_paper(self):
    paper_data = get_papers_from_db(with_citation=True)
    # assumes paper ids are 1-based and contiguous, so id - 1 indexes the list
    paper_obj = paper_data[self.paper_id - 1]
    self.title = paper_obj.title
    self.conf = paper_obj.get_conf()
    self.year = paper_obj.year
    self.citation = paper_obj.citation
    self.paper_repo_owner = paper_obj.repo_owner
    self.paper_repo_name = paper_obj.repo_name
Example #4
def crawl_main():
    paper_data = get_papers_from_db()
    for pd in paper_data:
        print(pd.id)
        link = pd.link
        link2 = preprocess_url(link)
        website, save_path = get_paper_path(link2, pd.id)
        response = requests.get(link2)
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
Example #5
def crawl_main():
    paper_data = get_papers_from_db()
    for pd in paper_data:
        print(pd.id)
        link = pd.link
        link2 = preprocess_url(link)
        save_path = get_paper_path(pd.id)
        if os.path.exists(save_path):
            continue
        print(pd.title)
        print(link2)
        if 'content_iccv' in link2:
            link2 = link2.replace('content_iccv', 'content_ICCV')
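        # preprocess_url appears to return the sentinel 'Hello' for links that should not be downloaded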
        if link2 != 'Hello':
            download_file(link2, save_path)
Example #6
def get_anomaly_repo():
    repo_info_list = get_repo_info(to_dict=False)
    ret_map = dict()
    paper_data = get_papers_from_db()
    for ri in repo_info_list:
        key = (ri.repo_owner, ri.repo_name)
        ret_map.setdefault(key, []).append(ri.paper_id)
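    # a repo referenced by more than one paper is flagged as an anomaly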
    for key in ret_map.keys():
        if len(ret_map[key]) > 1:
            for pid in ret_map[key]:
                print(paper_data[pid-1].title)
Example #7
def store_repo_info():
    paper_data = get_papers_from_db()
    db_objs = list()
    for pd in paper_data:
        pd_id = pd.id
        print(pd_id)
        r = Repo(pd)
        r.from_json()
        r.paper_id = pd_id
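        # skip repos whose crawled info has no star count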
        if r.stars_count is None:
            continue
        db_objs.append(r.to_db_obj())
    db_api = DataBaseApi()
    db_api.insert_objs(db_objs)
    db_api.close_session()
Example #8
def store_star_event():
    """
    For each repo:
        find all pages of json data;
        for each page of json data:
            for each json_obj in the page:
                store it into db.
    """
    paper_data = get_papers_from_db()
    db_obj_list = list()
    i = 1
    repo_set = set()
    for pd in paper_data:
        if (pd.repo_owner, pd.repo_name) in repo_set:
            continue
        print(i)
        repo_set.add((pd.repo_owner, pd.repo_name))
        json_dir_path = os.path.join(conf.star_path, pd.repo_owner,
                                     pd.repo_name)
        if not os.path.exists(json_dir_path):
            continue
        file_names = os.listdir(json_dir_path)
        file_number = len(file_names)
        # star pages are stored as 1.json, 2.json, ... in the repo's directory
        for j in range(1, file_number + 1):
            json_path = os.path.join(json_dir_path, str(j) + '.json')
            with open(json_path, 'r', encoding='utf-8') as f:
                json_obj_list = simplejson.load(f)
                for json_obj in json_obj_list:
                    se = StarEvent(pd)
                    se.from_json_obj(json_obj)
                    if se.timestamp is not None:
                        db_obj_list.append(se.to_db_obj())
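                    # flush to the database in batches of 20,000 rows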
                    if len(db_obj_list) == 20000:
                        db_api = DataBaseApi()
                        db_api.insert_objs(db_obj_list)
                        db_api.close_session()
                        db_obj_list = list()

        i += 1
    if len(db_obj_list) > 0:
        db_api = DataBaseApi()
        db_api.insert_objs(db_obj_list)
        db_api.close_session()
Example #9
def move_project_to_owner_dir():
    papers = get_papers_from_db()
    repo_num_map = dict()
    for pd in papers:
        repo_name = pd.repo_name
        repo_num_map[repo_name] = repo_num_map.get(repo_name, 0) + 1

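    # move uniquely-named repo directories under their owner; re-clone the rest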
    for pd in papers:
        repo_name = pd.repo_name
        repo_owner = pd.repo_owner
        old_repo_path = os.path.join(conf.root_path, 'repos', repo_name)
        if repo_num_map[repo_name] == 1 and os.path.exists(old_repo_path):
            repo_owner_path = os.path.join(conf.repo_path, repo_owner)
            if not os.path.exists(repo_owner_path):
                os.makedirs(repo_owner_path)
            new_repo_path = os.path.join(repo_owner_path, repo_name)
            os.rename(old_repo_path, new_repo_path)
        else:
            clone_repos(pd)
Example #10
            readme_path = os.path.join(repo_path, 'readme.md')
            with open(readme_path, 'r', encoding='utf-8', errors='ignore') as f:
                self.readme_content = f.read()
        self.html_soup = None


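# assumes: from markdown import markdown; from bs4 import BeautifulSoup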
class MarkDownReadme(Readme):
    def to_html(self):
        readme_html = markdown(self.readme_content)
        html_soup = BeautifulSoup(readme_html, 'html.parser')
        self.html_soup = html_soup

    def parse_readme_html(self):
        # walk header levels h1..h6; levels with no headers are skipped
        for hi in range(1, 7):
            headers = self.html_soup.find_all('h' + str(hi))
            if not headers:
                continue


if __name__ == '__main__':
    paper_data = get_papers_from_db()
    repo_set = set()
    for pd in paper_data:
        if (pd.repo_owner, pd.repo_name) in repo_set:
            continue
        repo_set.add((pd.repo_owner, pd.repo_name))
        mdr = MarkDownReadme(pd.repo_owner, pd.repo_name)
Example #11
def crawl_repo_info():
    papers = get_papers_from_db()
    for p in papers:
        r = RepoInfoCrawler(p)
        r.crawl_to_disk()
Example #12
from obj.paper import get_papers_from_db
from configuration import conf
import os
import re

paper_data = get_papers_from_db(with_citation=True)
i = 1
for pd in paper_data:
    repo_owner = pd.repo_owner
    repo_name = pd.repo_name
    repo_path = os.path.join(conf.repo_path, repo_owner, repo_name)
    if i > 378:
        print(pd.title, pd.code_link)
        print(pd.repo_owner)
        print(pd.link)

        if os.path.exists(repo_path):
            file_list = os.listdir(repo_path)
            readme_path = ''
            for f in file_list:
                if f.lower().startswith('readme.'):
                    readme_path = os.path.join(repo_path, f)
            if readme_path == '':
                readme_content = ''
            else:
                with open(readme_path, 'r', encoding='utf-8', errors='ignore') as readme_f:
                    readme_content = readme_f.read()
            temp = readme_content.lower()

            repo_desc = pd.get_repo_desc()