Example #1
    def get_edges(self):
        """Return all edges from a file in which each line contains an (author,
        paper) pair."""
        records = util.iter_csv_fwrapper(self.paper_idmap_file)
        idmap = {record[0]: int(record[1]) for record in records}
        refg = igraph.Graph.Read_Picklez(self.paper_graph_file.open())
        records = util.iter_csv_fwrapper(self.author_file)
        rows = ((refg, idmap[paper_id], author_id)
                for author_id, paper_id in records)

        for row in rows:
            for edge in self.get_paper_edges(*row):
                yield edge
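
All of these examples read and write CSV data through the project's util helpers, whose implementation is not shown on this page. Below is a minimal sketch of what iter_csv_fwrapper and write_csv_to_fwrapper might look like, assuming the arguments are luigi-style file targets and that every CSV file starts with a header row (as the ('paper_id', 'node_id') and ('venue_id', 'venue_name') headers written in later examples suggest).

import csv

def iter_csv_fwrapper(fwrapper):
    """Hypothetical reimplementation: yield CSV rows (lists of strings)
    from a file wrapper, skipping the header row."""
    with fwrapper.open() as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            yield row

def write_csv_to_fwrapper(fwrapper, header, rows):
    """Hypothetical reimplementation: write a header row followed by the
    data rows."""
    with fwrapper.open('w') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)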
Example #2
    def get_edges(self):
        """Return all edges from a file in which each line contains an (author,
        paper) pair."""
        records = util.iter_csv_fwrapper(self.paper_idmap_file)
        idmap = {record[0]: int(record[1]) for record in records}
        refg = igraph.Graph.Read_Picklez(self.paper_graph_file.open())
        records = util.iter_csv_fwrapper(self.author_file)
        rows = ((refg, idmap[paper_id], author_id)
                for author_id, paper_id in records)

        for row in rows:
            for edge in self.get_paper_edges(*row):
                yield edge
Example #3
 def run(self):
     dict_file, vecs_file = self.input()
     dictionary = gensim.corpora.Dictionary.load(dict_file.path)
     records = util.iter_csv_fwrapper(vecs_file)
     repdoc_corpus = (doc.decode('utf-8').split('|') for _, doc in records)
     bow_corpus = (dictionary.doc2bow(doc) for doc in repdoc_corpus)
     gensim.corpora.MmCorpus.serialize(self.output().path, bow_corpus)
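
This task streams a bag-of-words corpus straight to disk with gensim's MmCorpus.serialize. A minimal sketch of consuming the result afterwards; the file names are hypothetical, in the pipeline they come from the task's targets.

import gensim

dictionary = gensim.corpora.Dictionary.load('repdocs.dict')  # hypothetical path
bow_corpus = gensim.corpora.MmCorpus('repdocs.mm')           # streamed from disk

# Any gensim model that accepts a bag-of-words corpus can consume it, e.g.
tfidf = gensim.models.TfidfModel(bow_corpus, dictionary=dictionary)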
Example #4
    def run(self):
        refg = igraph.Graph()
        nodes = self.read_paper_vertices()
        refg.add_vertices(nodes)

        # Build and save paper id to node id mapping
        idmap = {str(v['name']): v.index for v in refg.vs}
        rows = sorted(idmap.items())
        util.write_csv_to_fwrapper(
            self.idmap_output_file, ('paper_id', 'node_id'), rows)

        # Now add venues to nodes as paper attributes
        for paper_id, venue in self.read_paper_venues():
            node_id = idmap[paper_id]
            refg.vs[node_id]['venue'] = venue

        # Next, add author ids
        for v in refg.vs:
            v['author_ids'] = []

        for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
            node_id = idmap[paper_id]
            refg.vs[node_id]['author_ids'].append(author_id)

        # Finally add edges from citation records
        citation_links = self.read_paper_references(idmap)
        refg.add_edges(citation_links)

        # Save in both pickle and graphml formats
        refg.write_picklez(self.pickle_output_file.path)
        refg.write_graphmlz(self.graphml_output_file.path)
        return refg
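
After this task runs, the pickled graph can be read back and sanity-checked with the standard igraph API. A small sketch; the path is hypothetical, in the pipeline it is self.pickle_output_file.path.

import igraph

refg = igraph.Graph.Read_Picklez('paper-citation-graph.pickle.gz')  # hypothetical path

print(refg.summary())             # vertex/edge counts plus attribute names
print(refg.vs[0]['venue'])        # attributes assigned in run() above
print(refg.vs[0]['author_ids'])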
Example #5
    def run(self):
        refg = igraph.Graph()
        nodes = self.read_paper_vertices()
        refg.add_vertices(nodes)

        # Build and save paper id to node id mapping
        idmap = {str(v['name']): v.index for v in refg.vs}
        rows = sorted(idmap.items())
        util.write_csv_to_fwrapper(self.idmap_output_file,
                                   ('paper_id', 'node_id'), rows)

        # Now add venues to nodes as paper attributes
        for paper_id, venue in self.read_paper_venues():
            node_id = idmap[paper_id]
            refg.vs[node_id]['venue'] = venue

        # Next, add author ids
        for v in refg.vs:
            v['author_ids'] = []

        for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
            node_id = idmap[paper_id]
            refg.vs[node_id]['author_ids'].append(author_id)

        # Finally add edges from citation records
        citation_links = self.read_paper_references(idmap)
        refg.add_edges(citation_links)

        # Save in both pickle and graphml formats
        refg.write_picklez(self.pickle_output_file.path)
        refg.write_graphmlz(self.graphml_output_file.path)
        return refg
Example #6
 def read_paper_references(self, idmap):
     """Filter out references to papers outside dataset."""
     for paper_id, ref_id in util.iter_csv_fwrapper(self.refs_file):
         try:
             yield (idmap[paper_id], idmap[ref_id])
         except KeyError:
             # paper or reference is not in the dataset; skip it
             continue
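
The KeyError handler simply drops any citation whose source or target paper is missing from idmap. A toy illustration of the same filtering with made-up ids:

idmap = {'p1': 0, 'p2': 1}                 # papers in the dataset
refs = [('p1', 'p2'), ('p1', 'p999')]      # p999 is outside the dataset
edges = [(idmap[a], idmap[b]) for a, b in refs
         if a in idmap and b in idmap]
# edges == [(0, 1)]; the ('p1', 'p999') pair is dropped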
Example #7
    def run(self):
        lcc_pickle_file, venue_map_file = self.input()

        # Read in the LCC graph
        lcc = igraph.Graph.Read_Picklez(lcc_pickle_file.path)

        # Build the community mapping:
        # each venue id is mapped to one or more node ids (the community)
        records = util.iter_csv_fwrapper(venue_map_file)
        communities = {int(venue_id): [] for venue_id, _ in records}
        for v in lcc.vs:
            for venue_id in v['venues']:
                communities[venue_id].append(v.index)

        # retrieve output files
        by_venue_file, by_author_file = self.output()

        # save ground truth communities
        comms = sorted(communities.items())
        rows = (' '.join(map(str, comm)) for comm_num, comm in comms)
        with by_venue_file.open('w') as f:
            f.write('\n'.join(rows))

        # save venue info for each author separately
        records = sorted([(v.index, v['venues']) for v in lcc.vs])
        rows = (' '.join(map(str, venues)) for node_id, venues in records)
        with by_author_file.open('w') as f:
            f.write('\n'.join(rows))
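
Both output files use the same plain-text layout: one line per community (or per node), containing space-separated integer ids. Reading the ground-truth file back is a one-liner; the file name is hypothetical, in the pipeline it is the by_venue_file target.

with open('ground-truth-by-venue.txt') as f:
    communities = [[int(node_id) for node_id in line.split()] for line in f]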
Example #8
    def run(self):
        graph_file, idmap_file, paper_file, author_file = self.input()

        # Read in dependencies
        lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
        author_venue_df = self.build_linked_venue_frame()
        venue_map = self.assign_venue_ids(author_venue_df)

        records = util.iter_csv_fwrapper(idmap_file)
        lcc_idmap = {record[0]: int(record[1]) for record in records}

        # Use sets in order to ensure uniqueness.
        for v in lcc.vs:
            v['venues'] = set()

        # Add the venue IDs to the node venue sets.
        for rownum, (author_id, venue) in author_venue_df.iterrows():
            node_id = lcc_idmap[str(author_id)]
            venue_id = venue_map[venue]
            lcc.vs[node_id]['venues'].add(venue_id)

        # Convert the sets to tuples.
        for v in lcc.vs:
            v['venues'] = tuple(v['venues'])

        # Save a copy of the graph with venue info
        pickle_outfile, venue_map_outfile = self.output()
        lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

        rows = ((vnum, venue) for venue, vnum in venue_map.iteritems())
        util.write_csv_to_fwrapper(
            venue_map_outfile, ('venue_id', 'venue_name'), rows)
Example #9
 def run(self):
     dict_file, vecs_file = self.input()
     dictionary = gensim.corpora.Dictionary.load(dict_file.path)
     records = util.iter_csv_fwrapper(vecs_file)
     repdoc_corpus = (doc.decode('utf-8').split('|') for _, doc in records)
     bow_corpus = (dictionary.doc2bow(doc) for doc in repdoc_corpus)
     gensim.corpora.MmCorpus.serialize(self.output().path, bow_corpus)
Example #10
    def run(self):
        lcc_pickle_file, venue_map_file = self.input()

        # Read in the LCC graph
        lcc = igraph.Graph.Read_Picklez(lcc_pickle_file.path)

        # Build the community mapping:
        # each venue id is mapped to one or more node ids (the community)
        records = util.iter_csv_fwrapper(venue_map_file)
        communities = {int(venue_id): [] for venue_id, _ in records}
        for v in lcc.vs:
            for venue_id in v['venues']:
                communities[venue_id].append(v.index)

        # retrieve output files
        by_venue_file, by_author_file = self.output()

        # save ground truth communities
        comms = sorted(communities.items())
        rows = (' '.join(map(str, comm)) for comm_num, comm in comms)
        with by_venue_file.open('w') as f:
            f.write('\n'.join(rows))

        # save venue info for each author separately
        records = sorted([(v.index, v['venues']) for v in lcc.vs])
        rows = (' '.join(map(str, venues)) for node_id, venues in records)
        with by_author_file.open('w') as f:
            f.write('\n'.join(rows))
Example #11
    def run(self):
        graph_file, idmap_file, paper_file, author_file = self.input()

        # Read in dependencies
        lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
        author_venue_df = self.build_linked_venue_frame()
        venue_map = self.assign_venue_ids(author_venue_df)

        records = util.iter_csv_fwrapper(idmap_file)
        lcc_idmap = {record[0]: int(record[1]) for record in records}

        # Use sets in order to ensure uniqueness.
        for v in lcc.vs:
            v['venues'] = set()

        # Add the venue IDs to the node venue sets.
        for rownum, (author_id, venue) in author_venue_df.iterrows():
            node_id = lcc_idmap[str(author_id)]
            venue_id = venue_map[venue]
            lcc.vs[node_id]['venues'].add(venue_id)

        # Convert the sets to tuples.
        for v in lcc.vs:
            v['venues'] = tuple(v['venues'])

        # Save a copy of the graph with venue info
        pickle_outfile, venue_map_outfile = self.output()
        lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

        rows = ((vnum, venue) for venue, vnum in venue_map.iteritems())
        util.write_csv_to_fwrapper(venue_map_outfile,
                                   ('venue_id', 'venue_name'), rows)
Example #12
    def read_lcc_author_repdocs(self):
        """Read and return an iterator over the author repdoc corpus, which excludes
        the authors not in the LCC.
        """
        author_repdoc_file, _, lcc_idmap_file = self.input()

        with lcc_idmap_file.open() as lcc_idmap_f:
            lcc_author_df = pd.read_csv(lcc_idmap_f, header=0, usecols=(0,))
            lcc_author_ids = lcc_author_df['author_id'].values

        csv.field_size_limit(sys.maxint)
        records = util.iter_csv_fwrapper(author_repdoc_file)
        return (doc.split('|') for author_id, doc in records
                if int(author_id) in lcc_author_ids)
Example #13
    def read_lcc_author_repdocs(self):
        """Read and return an iterator over the author repdoc corpus, which excludes
        the authors not in the LCC.
        """
        author_repdoc_file, _, lcc_idmap_file = self.input()

        with lcc_idmap_file.open() as lcc_idmap_f:
            lcc_author_df = pd.read_csv(lcc_idmap_f, header=0, usecols=(0,))
            lcc_author_ids = lcc_author_df['author_id'].values

        csv.field_size_limit(sys.maxint)
        records = util.iter_csv_fwrapper(author_repdoc_file)
        return (doc.split('|') for author_id, doc in records
                if int(author_id) in lcc_author_ids)
Example #14
 def read_paper_repdocs(self):
     paper_file = self.input()
     for record in util.iter_csv_fwrapper(paper_file):
         repdoc = '%s %s' % (record[1], record[4])
         yield (record[0], repdoc.decode('utf-8'))
Example #15
 def read_paper_repdocs(self):
     paper_file = self.input()
     for record in util.iter_csv_fwrapper(paper_file):
         repdoc = '%s %s' % (record[1], record[4])
         yield (record[0], repdoc.decode('utf-8'))
Example #16
 def read_paper_venues(self):
     """Iterate through (paper_id, venue) pairs from the paper csv file."""
     for record in util.iter_csv_fwrapper(self.papers_file):
         yield (record[0], record[2])
Example #17
 def read_paper_references(self, idmap):
     """Filter out references to papers outside dataset."""
     for paper_id, ref_id in util.iter_csv_fwrapper(self.refs_file):
         try: yield (idmap[paper_id], idmap[ref_id])
         except KeyError: pass
Example #18
 def read_paper_venues(self):
     """Iterate through (paper_id, venue) pairs from the paper csv file."""
     for record in util.iter_csv_fwrapper(self.papers_file):
         yield (record[0], record[2])
Example #19
 def run(self):
     repdocs = util.iter_csv_fwrapper(self.input())
     docs = ((docid, doc.decode('utf-8')) for docid, doc in repdocs)
     vecs = ((docid, doctovec.vectorize(doc)) for docid, doc in docs)
     rows = ((docid, '|'.join(doc).encode('utf-8')) for docid, doc in vecs)
     util.write_csv_to_fwrapper(self.output(), ('paper_id', 'doc'), rows)
Example #20
 def read_repdocs(self):
     records = util.iter_csv_fwrapper(self.input())
     return (doc.split('|') for _, doc in records)
Example #21
 def read_repdocs(self):
     records = util.iter_csv_fwrapper(self.input())
     return (doc.split('|') for _, doc in records)
Example #22
 def run(self):
     repdocs = util.iter_csv_fwrapper(self.input())
     docs = ((docid, doc.decode('utf-8')) for docid, doc in repdocs)
     vecs = ((docid, doctovec.vectorize(doc)) for docid, doc in docs)
     rows = ((docid, '|'.join(doc).encode('utf-8')) for docid, doc in vecs)
     util.write_csv_to_fwrapper(self.output(), ('paper_id', 'doc'), rows)