예제 #1
0
 def __init__(self, scheduler):
     """Keep the given scheduler and start with empty items, authors, graph.

     ``scheduler`` is stored unchanged on the instance; this method does
     not call anything on it.
     """
     self.scheduler = scheduler
     # Two independent, initially empty lists.
     self.all_items, self.all_authors = [], []
     # Fresh graph built with no constructor arguments.
     self.graph = Graph()
예제 #2
0
 def __init__(self, scheduler):
     """Initialise the instance around ``scheduler`` with empty state.

     The scheduler reference is kept as-is; the item and author
     collections start out empty and a new ``Graph()`` is created.
     """
     self.scheduler = scheduler
     # Separate list objects, so the two collections never alias each other.
     self.all_items, self.all_authors = [], []
     self.graph = Graph()
예제 #3
0
class ItemPipeline:
    """Store scraped publications and authors and track citation links.

    Every publication handed to :meth:`add_items` is written to
    ``all_jsons/<publication_uid>.json``, remembered in ``all_items`` and
    added to ``graph``.  Every author handed to :meth:`add_authors` is
    written to ``authors/<author_name>.json`` and remembered in
    ``all_authors``.  Follow-up links are passed to ``scheduler.add``.
    """

    # Number of links taken from each of ``cited_in`` and ``references``
    # when queueing follow-up work on the scheduler.
    MAX_LINKS_PER_DIRECTION = 10

    def __init__(self, scheduler):
        """Keep ``scheduler`` and start with empty collections and graph."""
        self.scheduler = scheduler
        self.all_items = []
        self.all_authors = []
        self.graph = Graph()

    def add_items(self, all_datas):
        """Persist one publication record, queue its links, update the graph.

        ``all_datas`` must provide ``'cited_in'`` and ``'references'``
        (sliceable sequences that can be concatenated) and
        ``all_datas['datas']['publication_uid']``.  Only the first
        ``MAX_LINKS_PER_DIRECTION`` links of each kind are scheduled, but
        all of them are added to the graph.
        """
        self.dump_json(all_datas)
        limit = self.MAX_LINKS_PER_DIRECTION
        self.scheduler.add(all_datas['cited_in'][:limit] +
                           all_datas['references'][:limit])
        self.add_to_graph(
            publication_uid=all_datas['datas']['publication_uid'],
            cited_in=all_datas['cited_in'],
            references=all_datas['references'])

    def add_authors(self, all_authors):
        """Persist one author record and schedule each remaining value.

        ``all_authors`` is a mapping holding ``'author_name'``, optionally
        ``'number_of_pages'``, and any number of further entries.  The
        further entries become the ``'publications'`` part of the dumped
        record, and each of their values is passed to ``scheduler.add``.

        The caller's mapping is left untouched.

        Raises:
            KeyError: if ``'author_name'`` is missing.
        """
        # Work on a shallow copy so neither the caller's dict nor the record
        # kept in ``self.all_authors`` is affected by later changes to it.
        publications = dict(all_authors)
        author_name = publications.pop('author_name')
        # The page count is only discarded, so its absence is not an error.
        publications.pop('number_of_pages', None)

        json_format = {
            'author_name': author_name,
            'publications': publications,
        }
        print('dumping')
        print(json_format)
        self.dump_author(json_format)
        print('dumped')
        for value in publications.values():
            self.scheduler.add(value)

    def dump_author(self, result):
        """Write ``result`` to ``authors/<author_name>.json`` and record it.

        The ``authors`` directory is not created here; ``open`` raises if
        it does not exist.
        """
        with open('authors/' + result['author_name'] + '.json',
                  'w') as json_file:
            json.dump(result, json_file)
        self.all_authors.append(result)

    def dump_json(self, result):
        """Write ``result`` to ``all_jsons/<publication_uid>.json``.

        The uid is read from ``result['datas']['publication_uid']``.  The
        record is also appended to ``all_items``.  The ``all_jsons``
        directory is not created here.
        """
        publication_uid = result['datas']['publication_uid']
        with open('all_jsons/' + str(publication_uid) + '.json',
                  'w') as json_file:
            json.dump(result, json_file)
        self.all_items.append(result)

    def get_items_len(self):
        """Return how many publication records have been dumped so far."""
        return len(self.all_items)

    def add_to_graph(self, publication_uid, cited_in, references):
        """Add the publication node and one edge per citation link.

        Edges are added as ``(source, target)``: from the uid of every
        ``cited_in`` link to ``publication_uid``, and from
        ``publication_uid`` to the uid of every ``references`` link.  Link
        uids are obtained through ``get_uid_from_url``.
        """
        self.graph.add_node(publication_uid)
        for link in cited_in:
            self.graph.add_edge(get_uid_from_url(link), publication_uid)
        for link in references:
            self.graph.add_edge(publication_uid, get_uid_from_url(link))

    def get_graph(self):
        """Return the graph object built so far (not a copy)."""
        return self.graph

    def pickle_graph(self):
        """Pickle the graph into ``graph.pkl`` in the working directory."""
        with open('graph.pkl', 'wb') as pickled_graph:
            pickle.dump(self.graph, pickled_graph)

    def save_to_text_file(self):
        """Write ``str(self.graph)`` to ``graph.txt``."""
        with open('graph.txt', 'w') as text_graph:
            text_graph.write(str(self.graph))
예제 #4
0
class ItemPipeline:
    """Collect scraped publication and author records.

    Publications go through :meth:`add_items`: each is dumped to
    ``all_jsons/<publication_uid>.json``, appended to ``all_items`` and
    linked into ``graph``.  Authors go through :meth:`add_authors`: each is
    dumped to ``authors/<author_name>.json`` and appended to
    ``all_authors``.  Links to follow next are handed to ``scheduler.add``.
    """

    # How many links of each kind (``cited_in`` / ``references``) are
    # forwarded to the scheduler per publication.
    MAX_LINKS_PER_DIRECTION = 10

    def __init__(self, scheduler):
        """Store ``scheduler``; begin with no items, no authors, new graph."""
        self.scheduler = scheduler
        self.all_items = []
        self.all_authors = []
        self.graph = Graph()

    def add_items(self, all_datas):
        """Dump a publication record, schedule its links, extend the graph.

        Reads ``all_datas['cited_in']``, ``all_datas['references']`` and
        ``all_datas['datas']['publication_uid']``.  The scheduler receives
        at most ``MAX_LINKS_PER_DIRECTION`` links of each kind in a single
        ``add`` call; the graph receives every link.
        """
        self.dump_json(all_datas)
        limit = self.MAX_LINKS_PER_DIRECTION
        self.scheduler.add(all_datas['cited_in'][:limit] +
                           all_datas['references'][:limit])
        self.add_to_graph(
            publication_uid=all_datas['datas']['publication_uid'],
            cited_in=all_datas['cited_in'],
            references=all_datas['references'])

    def add_authors(self, all_authors):
        """Dump an author record and schedule every remaining value.

        ``'author_name'`` is required; ``'number_of_pages'`` is dropped
        when present.  All other entries form the ``'publications'``
        mapping of the dumped record and their values are passed one by
        one to ``scheduler.add``.

        The mapping passed in is not modified.

        Raises:
            KeyError: if ``'author_name'`` is missing.
        """
        # Shallow copy: keeps the caller's dict intact and avoids storing an
        # alias to it inside ``self.all_authors``.
        publications = dict(all_authors)
        author_name = publications.pop('author_name')
        # Only discarded, so a record without a page count is accepted.
        publications.pop('number_of_pages', None)

        json_format = {
            'author_name': author_name,
            'publications': publications,
        }
        print('dumping')
        print(json_format)
        self.dump_author(json_format)
        print('dumped')
        for value in publications.values():
            self.scheduler.add(value)

    def dump_author(self, result):
        """Serialise ``result`` to ``authors/<author_name>.json``.

        Also appends ``result`` to ``all_authors``.  The target directory
        must already exist; it is not created here.
        """
        with open('authors/' + result['author_name'] + '.json',
                  'w') as json_file:
            json.dump(result, json_file)
        self.all_authors.append(result)

    def dump_json(self, result):
        """Serialise ``result`` to ``all_jsons/<publication_uid>.json``.

        The uid comes from ``result['datas']['publication_uid']``.  Also
        appends ``result`` to ``all_items``.  The target directory must
        already exist; it is not created here.
        """
        publication_uid = result['datas']['publication_uid']
        with open('all_jsons/' + str(publication_uid) + '.json',
                  'w') as json_file:
            json.dump(result, json_file)
        self.all_items.append(result)

    def get_items_len(self):
        """Return the number of records currently held in ``all_items``."""
        return len(self.all_items)

    def add_to_graph(self, publication_uid, cited_in, references):
        """Insert the publication and its citation edges into the graph.

        For each link in ``cited_in`` an edge runs from that link's uid to
        ``publication_uid``; for each link in ``references`` an edge runs
        from ``publication_uid`` to that link's uid.  Uids of links are
        produced by ``get_uid_from_url``.
        """
        self.graph.add_node(publication_uid)
        for link in cited_in:
            self.graph.add_edge(get_uid_from_url(link), publication_uid)
        for link in references:
            self.graph.add_edge(publication_uid, get_uid_from_url(link))

    def get_graph(self):
        """Return the live graph object (the same instance, not a copy)."""
        return self.graph

    def pickle_graph(self):
        """Serialise the graph with ``pickle`` into ``graph.pkl``."""
        with open('graph.pkl', 'wb') as pickled_graph:
            pickle.dump(self.graph, pickled_graph)

    def save_to_text_file(self):
        """Write the graph's string form to ``graph.txt``."""
        with open('graph.txt', 'w') as text_graph:
            text_graph.write(str(self.graph))