Example #1
def getPapersAbstract():
    import codecs
    from bs4 import UnicodeDammit
    from dcclient import DataCenterClient

    # Read the publication ids, one per line, skipping the header line.
    f = open("E:\\ids.txt")
    f.next()
    ids = []
    for line in f:
        x = line.split("\n")
        ids.append(int(x[0]))
    f.close()
    print len(ids)
    c = DataCenterClient("tcp://10.1.1.211:32011")
    f_out = codecs.open("E:\\abstracts_1.txt", "w", encoding="utf-8")
    # Fetch the publications in chunks of 1000 ids so the last partial chunk is not dropped.
    for i in range(0, len(ids), 1000):
        print "DUMP %s" % i
        x = c.getPublicationsById(ids[i:i + 1000])
        abstracts = {}
        conf = {}
        authors = {}
        title = {}
        year = {}
        for p in x.publications:
            # Flatten whitespace so each field stays on a single output line.
            abstracts[p.id] = p.abs.replace("\n", " ").replace("\t", " ")
            conf[p.id] = p.jconf_name
            authors[p.id] = ",".join([str(a) for a in p.author_ids])
            title[p.id] = p.title
            year[p.id] = p.year
        # Write a six-line record (id, year, venue, author ids, title, abstract)
        # for every publication with a non-trivial abstract.
        for p in abstracts:
            if len(abstracts[p]) > 2:
                f_out.write("%s\n%s\n%s\n%s\n%s\n%s\n" %
                            (p, year[p], conf[p], authors[p], title[p],
                             UnicodeDammit(abstracts[p]).markup))
    f_out.close()
Example #2
def getPapersCitation():
    from collections import defaultdict
    from dcclient import DataCenterClient

    # Read the publication ids from the first tab-separated column, skipping the header line.
    f = open("E:\\vis.txt")
    f.next()
    ids = []
    for line in f:
        x = line.split("\t")
        ids.append(int(x[0]))
    f.close()
    c = DataCenterClient("tcp://10.1.1.211:32011")
    x = c.getPublicationsById(ids)
    id_set = set(ids)
    count = 0
    # citation[p] holds the set of publications in id_set that p cites.
    citation = defaultdict(set)
    for p in x.publications:
        for y in p.cite_pubs:
            if y in id_set:
                print count
                count += 1
                citation[p.id].add(y)
        for y in p.cited_by_pubs:
            if y in id_set:
                print count
                count += 1
                citation[y].add(p.id)
    # Dump the edges as tab-separated "citing<TAB>cited" pairs.
    f_out = open("E:\\citation.txt", "w")
    for p in citation:
        for q in citation[p]:
            f_out.write("%s\t%s\n" % (p, q))
    f_out.close()
Example #5
def getCitationNetwork():
    import json
    from collections import defaultdict
    from dcclient import DataCenterClient

    c = DataCenterClient("tcp://10.1.1.211:32011")
    # Seed the network with publications matching the query.
    x = c.searchPublications("deep learning")
    items = []
    cite_pubs = []
    key_terms = defaultdict(int)
    year_terms = defaultdict(lambda: defaultdict(int))
    for p in x.publications:
        if p.year <= 1970:
            continue
        # extractPublication() is a helper defined elsewhere in this module.
        item, children, parents, kt = extractPublication(p)
        if len(children) > 0:
            items.append(item)
        cite_pubs.extend(children)
        cite_pubs.extend(parents)
        for k in kt:
            key_terms[k.lower()] += 1
            year_terms[p.year][k.lower()] += 1
    # Fetch the cited / citing publications and extract them as well.
    cite_pubs = list(set(cite_pubs))
    x = c.getPublicationsById(cite_pubs)
    for p in x.publications:
        if p.year <= 1970:
            continue
        item, children, parents, kt = extractPublication(p)
        if len(children) > 0:
            items.append(item)
        cite_pubs.extend(children)
        for k in kt:
            key_terms[k.lower()] += 1
            year_terms[p.year][k.lower()] += 1

    # Key terms ranked by overall frequency.
    sorted_key_terms = sorted(key_terms.items(),
                              key=lambda x: x[1],
                              reverse=True)

    dump = open("pubs_dump.json", "w")
    d = json.dumps(items)
    dump.write(d)
    dump.close()
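A minimal sketch for reading the JSON dump written above back into a Python list; the helper name loadPubsDump is hypothetical:

import json

def loadPubsDump(path="pubs_dump.json"):
    # Load the list of publication items written by getCitationNetwork().
    f = open(path)
    items = json.load(f)
    f.close()
    return items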
Example #6
def getPapersAbstractYearConf():
    from collections import defaultdict
    from dcclient import DataCenterClient
    import codecs
    from bs4 import UnicodeDammit
    import os

    # Read the publication ids, one per line, skipping the header line.
    f = open("E:\\ids.txt")
    f.next()
    ids = []
    for line in f:
        x = line.split("\n")
        ids.append(int(x[0]))
    f.close()
    c = DataCenterClient("tcp://10.1.1.211:32011")

    def createFile(year, conf):
        # Alternative layout: one file per (year, conference) pair.
        if not os.path.exists(str(year)):
            os.makedirs(str(year))
        return codecs.open(os.path.join(str(year), conf),
                           "w",
                           encoding="utf-8")

    #files = defaultdict(dict)
    files = {}
    # Fetch the publications in chunks of 10000 ids so the last partial chunk is not dropped.
    for i in range(0, len(ids), 10000):
        print "DUMP %s" % i
        x = c.getPublicationsById(ids[i:i + 10000])
        abstracts = {}
        conf = {}
        title = {}
        year = {}
        for p in x.publications:
            # Flatten whitespace so each field stays on a single output line.
            abstracts[p.id] = p.abs.replace("\n", " ").replace("\t", " ")
            conf[p.id] = p.jconf_name  #.replace("/"," ").replace("*"," ")
            title[p.id] = p.title
            year[p.id] = p.year
        # Write a four-line record (id, venue, title, abstract) to one file per year.
        for p in abstracts:
            if len(abstracts[p]) > 2 and len(conf[p]) > 1:
                #if not files[year[p]].has_key(conf[p]):
                if year[p] not in files:
                    files[year[p]] = codecs.open(str(year[p]),
                                                 "w",
                                                 encoding="utf-8")
                    #files[year[p]][conf[p]] = createFile(year[p], conf[p])
                out = files[year[p]]
                out.write(
                    "%s\n%s\n%s\n%s\n" %
                    (p, conf[p], title[p], UnicodeDammit(abstracts[p]).markup))
    for out in files.values():
        out.close()
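A short companion sketch, assuming the four-line per-record layout written above, for reading one per-year file back; the helper name readYearFile is hypothetical:

import codecs

def readYearFile(year):
    # Each record written by getPapersAbstractYearConf() spans four lines:
    # id, conference, title, abstract.
    records = []
    f = codecs.open(str(year), encoding="utf-8")
    while True:
        lines = [f.readline() for _ in range(4)]
        if not lines[0]:
            break
        pid, conf, title, abstract = [l.rstrip("\n") for l in lines]
        records.append((int(pid), conf, title, abstract))
    f.close()
    return records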