示例#1
0
def get_space_by_year(data_path, start_year, end_year, threshold, level):
    """Build per-year reference counts and the citers that produced them.

    For every year from ``start_year`` to ``end_year`` (inclusive) the tree
    under ``data_path`` is walked, each ``.xml`` file is parsed, and documents
    whose publication year equals that year are collected. The references
    found in each document's units (as returned by
    ``cd.xml_to_citation_list_consys(xml, level)``) are counted.

    Args:
        data_path: Root directory searched recursively for ``.xml`` files.
        start_year: First year (inclusive) to build a space for.
        end_year: Last year (inclusive) to build a space for.
        threshold: ``1`` keeps every reference. A value greater than 1 keeps
            at most that many of the most frequent references. Any other
            value is treated as a fraction of the number of distinct
            references. After the cut, the trailing entries tied with the
            last kept count are trimmed as well (see the note in the body).
        level: Passed through unchanged to ``cd.xml_to_citation_list_consys``.

    Returns:
        A two-element list ``[space, space_citer]``. ``space[year]`` is an
        ``OrderedDict`` mapping reference -> count, sorted by descending
        count. ``space_citer[year]`` maps document identifier -> citer
        object whose ``units`` attribute holds the document's units.
    """
    space = {}
    space_citer = {}
    # Base names of files already assigned to a year. Keyed on the bare file
    # name, so a same-named file in another directory is skipped once one of
    # them has been counted.
    processed = set()
    for y in range(start_year, end_year + 1):
        counts = {}
        citers = {}
        space_citer[y] = citers
        # NOTE(review): the tree is re-walked and every not-yet-processed XML
        # file is re-parsed for each year; kept as-is because the per-year
        # order decides which of two same-named files gets counted.
        for dir_name, _dir_names, file_names in os.walk(data_path):
            for filename in file_names:
                if filename in processed:
                    continue
                file_path = os.path.join(dir_name, filename)
                if os.path.splitext(file_path)[1] != '.xml':
                    continue
                xml = etree.parse(file_path)

                # Publication year: last four characters of the display date,
                # else the cover date text, else the copyright year attribute.
                display_dates = xml.xpath("//prism:coverDisplayDate", namespaces=ns)
                if display_dates and display_dates[0].text is not None:
                    year = display_dates[0].text[-4:]
                else:
                    cover_dates = xml.xpath("//prism:coverDate", namespaces=ns)
                    if cover_dates:
                        # NOTE(review): the full text is passed to int() below;
                        # confirm coverDate holds a bare year and not a full date.
                        year = cover_dates[0].text
                    else:
                        copyright_years = xml.xpath("//ce:copyright/@year", namespaces=ns)
                        if not copyright_years:
                            print("without data", file_path)
                            continue
                        year = copyright_years[0]

                if y != int(year):
                    continue

                processed.add(filename)
                doc_id = xml.xpath("//dct:identifier",namespaces=ns)[0].text
                citer = citation.Citer(doc_id, 0, 0, 0, 0)
                citers[doc_id] = citer
                citing_set_list = cd.xml_to_citation_list_consys(xml, level)
                citer.units = citing_set_list
                for unit in citing_set_list:
                    for ref in unit:
                        counts[ref] = counts.get(ref, 0) + 1

        ranked = sorted(counts.items(), key=lambda t: t[1], reverse=True)
        num_nodes = len(ranked)

        if threshold != 1:
            if threshold > 1:
                # Clamp so an absolute threshold larger than the space cannot
                # push the backward scan below past the end of the list.
                truncate = min(threshold, num_nodes)
            else:
                truncate = int(threshold * num_nodes)

            kept = list(itertools.islice(ranked, truncate))
            # An empty year (or a fraction that rounds to zero) leaves nothing
            # to trim; skip the tie handling instead of indexing an empty list.
            if kept:
                truncate_value = kept[-1][1]
                # Scan backwards for the last entry whose count differs from
                # the cutoff count and cut there.
                # NOTE(review): the cut index excludes that differing entry
                # too, and index 0 is never examined -- possibly an off-by-one;
                # behaviour is preserved pending confirmation.
                for i in range(len(kept) - 1, 0, -1):
                    if kept[i][1] != truncate_value:
                        truncate = i
                        break
                kept = kept[:truncate]
            space[y] = OrderedDict(kept)

            print("original citers:" + str(y) + "-" + str(len(citers)))
            emptied = []
            for key, citer in citers.items():
                # Restrict every unit to surviving references and drop units
                # left with fewer than two of them. Slice assignment keeps the
                # same list object, as the original in-place deletes did.
                filtered = (set(elem for elem in unit if elem in space[y])
                            for unit in citer.units)
                citer.units[:] = [unit for unit in filtered if len(unit) >= 2]
                if len(citer.units) == 0:
                    emptied.append(key)
            # Deleted after the loop so the dict is not mutated while iterated.
            for key in emptied:
                del citers[key]
        else:
            space[y] = OrderedDict(ranked)

        print("Year " + str(y) + "- nodes:" + str(len(space[y])) + ", space: " + str(num_nodes) + ", citer:" + str(len(space_citer[y])))
    return [space, space_citer]
示例#2
0
def get_space(data_path, start_year, end_year, threshold, level):
    """Build one reference-count table and the citers behind it for all documents.

    Walks ``data_path`` recursively, parses every ``.xml`` file and, for each
    document that has a publication year, at least one usable abstract
    sentence and at least one unit (as returned by
    ``cd.xml_to_citation_list_consys(xml, level)``), records a citer object
    and counts the references in its units.

    Args:
        data_path: Root directory searched recursively for ``.xml`` files.
        start_year: Accepted for signature compatibility; not used here.
        end_year: Accepted for signature compatibility; not used here.
        threshold: ``1`` keeps every reference. A value greater than 1 keeps
            at most that many of the most frequent references. Any other
            value is treated as a fraction of the number of distinct
            references. After the cut, the trailing entries tied with the
            last kept count are trimmed as well (see the note in the body).
        level: Passed through unchanged to ``cd.xml_to_citation_list_consys``.

    Returns:
        A two-element list ``[space, space_citer]``. ``space`` is an
        ``OrderedDict`` mapping reference -> count, sorted by descending
        count. ``space_citer`` maps document identifier -> citer object.
    """
    space = dict()
    space_citer = dict()
    for dir_name, _dir_names, file_names in os.walk(data_path):
        for filename in file_names:
            file_path = os.path.join(dir_name, filename)
            if os.path.splitext(file_path)[1] != '.xml':
                continue
            xml = etree.parse(file_path)

            # Publication year: last four characters of the display date,
            # else the cover date text, else the copyright year attribute.
            display_dates = xml.xpath("//prism:coverDisplayDate", namespaces=ns)
            if display_dates and display_dates[0].text is not None:
                year = display_dates[0].text[-4:]
            else:
                cover_dates = xml.xpath("//prism:coverDate", namespaces=ns)
                if cover_dates:
                    # NOTE(review): the full text is passed to int() below;
                    # confirm coverDate holds a bare year and not a full date.
                    year = cover_dates[0].text
                else:
                    copyright_years = xml.xpath("//ce:copyright/@year", namespaces=ns)
                    if not copyright_years:
                        print("without data", file_path)
                        continue
                    year = copyright_years[0]

            doc_id = xml.xpath("//dct:identifier",namespaces=ns)[0].text

            # Sentences that cite the first bibliography entry whose main
            # title contains 'function'.
            contexts = []
            bib_ids = xml.xpath("//ce:bib-reference[contains(.//sb:maintitle,'function')]/@id", namespaces=ns)
            if bib_ids:
                bib_id = str(bib_ids[0])
                cs = xml.xpath("//ce:cross-refs[contains(@refid,'"+bib_id+"')]//ancestor::s ", namespaces=ns)
                for c in cs:
                    c_text = sentence_clean(c)
                    # Keep only sentences with more than 40 non-space characters.
                    if (c_text is not None) and (len(re.sub(" ", "", c_text)) > 40):
                        contexts.append(citation.classes.Sentence(c_text))

            abstracts = []
            for s in xml.xpath("//ce:abstract-sec//s", namespaces=ns):
                s_text = sentence_clean(s)
                if (s_text is not None) and (len(re.sub(" ", "", s_text)) > 40):
                    abstracts.append(citation.classes.Sentence(s_text))
            # Skip documents without any usable abstract sentence. The former
            # `abstracts < 1` compared the list itself to an int, so the skip
            # never fired on Python 2 and raises TypeError on Python 3.
            if not abstracts:
                continue

            citer = citation.classes.Citer(doc_id, int(year), file_path, 0, 0, 0, 0)
            space_citer[doc_id] = citer
            doi_xml = xml.xpath("//ce:doi", namespaces=ns)
            if doi_xml:
                doi = str(doi_xml[0].text)
                citer.bib_id = doi
                # The second-to-last dot-separated piece of the DOI is stored
                # as the month when it is all digits. A DOI text without any
                # dot has no such piece and is left without a month.
                doi_parts = doi.split(".")
                if len(doi_parts) >= 2 and doi_parts[-2].isdigit():
                    citer.month = int(doi_parts[-2])
            citer.path = file_path
            citer.abstract = abstracts
            citer.citation_context = contexts
            citing_set_list = cd.xml_to_citation_list_consys(xml, level)
            citer.orig_units = citing_set_list
            # Independent copy: `units` is filtered below, `orig_units` is not.
            citer.units = copy.deepcopy(citing_set_list)
            if len(citing_set_list) == 0:
                del space_citer[doc_id]
                print("del "+doc_id)
                continue
            for unit in citing_set_list:
                for ref in unit:
                    space[ref] = space.get(ref, 0) + 1

    ranked = sorted(space.items(), key=lambda t: t[1], reverse=True)
    num_nodes = len(ranked)

    if threshold != 1:
        if threshold > 1:
            # Clamp so an absolute threshold larger than the space cannot
            # push the backward scan below past the end of the list.
            truncate = min(threshold, num_nodes)
        else:
            truncate = int(threshold * num_nodes)

        kept = list(itertools.islice(ranked, truncate))
        # Nothing collected (or a fraction that rounds to zero) leaves nothing
        # to trim; skip the tie handling instead of indexing an empty list.
        if kept:
            truncate_value = kept[-1][1]
            # Scan backwards for the last entry whose count differs from the
            # cutoff count and cut there.
            # NOTE(review): the cut index excludes that differing entry too,
            # and index 0 is never examined -- possibly an off-by-one;
            # behaviour is preserved pending confirmation.
            for i in range(len(kept) - 1, 0, -1):
                if kept[i][1] != truncate_value:
                    truncate = i
                    break
            kept = kept[:truncate]
        space = OrderedDict(kept)

        print("original citers:" + str(len(space_citer)))
        for citer in space_citer.values():
            # Restrict every unit to surviving references and drop units left
            # with fewer than two of them. Citers that end up with no units
            # stay in space_citer; they are not removed here.
            filtered = (set(elem for elem in unit if elem in space)
                        for unit in citer.units)
            citer.units[:] = [unit for unit in filtered if len(unit) >= 2]
    else:
        space = OrderedDict(ranked)

    print("All: " + "- nodes:" + str(len(space)) + ", space: " + str(num_nodes) + ", citer:" + str(len(space_citer)))
    return [space, space_citer]