Пример #1
0
def extract_common_features(item):
    """Build the paper-level feature groups shared by all authors of ``item``.

    ``item`` is a dict-like paper record. ``title`` is required; ``keywords``,
    ``fields``, ``venue`` and ``abst`` are optional and produce an empty list
    when missing or falsy.

    Returns:
        A 5-tuple ``(title_features, keywords_features, venue_features,
        abst_features, fields_features)``. Each populated entry is whatever
        ``transform_feature`` returns for that group.
    """
    # Title: cleaned with stemming enabled, lower-cased, tagged "title".
    cleaned_title = string_utils.clean_sentence(item["title"], stemming=True)
    title_features = transform_feature(cleaned_title.lower(), "title")

    # Keywords: every entry goes through clean_name, tagged 'keyword'.
    raw_keywords = item.get("keywords")
    keywords_features = []
    if raw_keywords:
        cleaned_keywords = [string_utils.clean_name(kw) for kw in raw_keywords]
        keywords_features = transform_feature(cleaned_keywords, 'keyword')

    # Fields: handled exactly like keywords, tagged 'fields'.
    raw_fields = item.get('fields')
    fields_features = []
    if raw_fields:
        cleaned_fields = [string_utils.clean_name(fld) for fld in raw_fields]
        fields_features = transform_feature(cleaned_fields, 'fields')

    # Venue: only names longer than two characters are used; the name is
    # lower-cased before cleaning and tagged "venue".
    venue = item.get('venue', '')
    venue_features = []
    if venue and len(venue) > 2:
        cleaned_venue = string_utils.clean_sentence(venue.lower())
        venue_features = transform_feature(cleaned_venue, "venue")

    # Abstract: lower-cased, cleaned, tagged "abst".
    abstract = item.get('abst')
    abst_features = []
    if abstract:
        cleaned_abstract = string_utils.clean_sentence(abstract.lower())
        abst_features = transform_feature(cleaned_abstract, "abst")

    return (title_features, keywords_features, venue_features, abst_features,
            fields_features)
Пример #2
0
def extract_common_features(item):
    """Extract the title, keyword and venue features of a paper record.

    Args:
        item: dict-like paper record. ``title`` is required; ``keywords``
            (an iterable of strings) and ``venue`` (a string) are optional.

    Returns:
        A 3-tuple ``(title_features, keywords_features, venue_features)``.
        ``title_features`` is the result of ``string_utils.clean_sentence``
        on the title with stemming enabled. ``keywords_features`` is a list
        extended with the cleaned result of every keyword. ``venue_features``
        is the cleaned, lower-cased venue, or ``[]`` when the venue is
        missing, null, or at most two characters long.
    """
    title_features = string_utils.clean_sentence(item["title"], stemming=True)

    keywords_features = []
    # A missing or empty keyword collection simply contributes nothing.
    for keyword in item.get("keywords") or []:
        keywords_features.extend(
            string_utils.clean_sentence(keyword, stemming=True))

    # ``venue`` can be present with a null value; ``.get(key, '')`` would
    # then return None and ``len(None)`` would raise TypeError, so fall back
    # to an empty string for any falsy value.
    venue_name = item.get('venue') or ''
    venue_features = []
    if len(venue_name) > 2:
        venue_features = string_utils.clean_sentence(venue_name.lower(),
                                                     stemming=True)
    return title_features, keywords_features, venue_features
Пример #3
0
def extract_common_features(item):
    """Extract the tagged title, keyword and venue features of a paper record.

    Args:
        item: dict-like paper record. ``title`` is required; ``keywords``
            (an iterable of strings) and ``venue`` (a string) are optional.

    Returns:
        A 3-tuple ``(title_features, keywords_features, venue_features)``.
        Each populated entry is whatever ``transform_feature`` returns for
        that group; ``keywords_features`` and ``venue_features`` are ``[]``
        when the corresponding field is missing or falsy (and, for the
        venue, when it is at most two characters long).
    """
    # Title: cleaned with stemming enabled, lower-cased, tagged "title".
    title_features = transform_feature(
        string_utils.clean_sentence(item["title"], stemming=True).lower(),
        "title")

    # Keywords: every entry goes through clean_name, tagged 'keyword'.
    keywords_features = []
    keywords = item.get("keywords")
    if keywords:
        keywords_features = transform_feature(
            [string_utils.clean_name(k) for k in keywords], 'keyword')

    # ``venue`` can be present with a null value; ``.get(key, '')`` would
    # then return None and ``len(None)`` would raise TypeError, so fall back
    # to an empty string for any falsy value.
    venue_name = item.get('venue') or ''
    venue_features = []
    if len(venue_name) > 2:
        venue_features = transform_feature(
            string_utils.clean_sentence(venue_name.lower()), "venue")
    return title_features, keywords_features, venue_features
Пример #4
0
def extract_author_features(item, order=None):
    """Collect a flat feature list for the authors of a paper record.

    For each selected author the features are, in order: co-author name
    features, organisation features (the author's own first, then the
    co-authors'), followed by the paper's title, keyword and venue features.

    Args:
        item: dict-like paper record with an ``authors`` list of dicts.
        order: when not ``None``, only the author at this index is processed;
            otherwise the features of every author are concatenated.

    Returns:
        A single flat list holding the features of all processed authors.
    """
    title_features, keywords_features, venue_features = (
        extract_common_features(item))
    authors = item["authors"]

    flat_features = []
    for position, author in enumerate(authors):
        # When a specific position was requested, skip all other authors.
        if order is not None and position != order:
            continue

        # The author's own organisation counts only when the cleaned name is
        # longer than two characters.
        own_org = string_utils.clean_name(author.get("org", ""))
        org_tokens = []
        if len(own_org) > 2:
            org_tokens.extend(transform_feature(own_org, "org"))

        # Every other author on the paper contributes a name feature and an
        # organisation feature, under the same length threshold.
        name_tokens = []
        for other_position, other in enumerate(authors):
            if other_position == position:
                continue
            other_name = other.get("name", "")
            other_org = string_utils.clean_name(other.get("org", ""))
            if len(other_name) > 2:
                cleaned_name = string_utils.clean_name(other_name)
                name_tokens.extend(transform_feature([cleaned_name], "name"))
            if len(other_org) > 2:
                cleaned_org = string_utils.clean_sentence(other_org.lower())
                org_tokens.extend(transform_feature(cleaned_org, "org"))

        # Extending directly keeps the result flat across authors.
        flat_features.extend(name_tokens + org_tokens + title_features +
                             keywords_features + venue_features)
    return flat_features
Пример #5
0
def extract_author_features(item):
    """Render the features of a paper and all its authors as one string.

    Name and organisation features are gathered over every entry in
    ``item['authors']``. Each feature group is then space-joined with one
    trailing space, and the groups are concatenated in the order
    name, org, title, keywords, venue.

    Args:
        item: dict-like paper record with an ``authors`` list of dicts.

    Returns:
        A single string containing all feature groups.
    """
    title_features, keywords_features, venue_features = (
        extract_common_features(item))

    name_features = []
    org_features = []
    for author in item['authors']:
        cleaned_name = string_utils.clean_name(author.get('name', ''))
        name_tokens = transform_feature(cleaned_name, 'name')

        # The organisation is fed in twice: as cleaned text and with its
        # spaces replaced by underscores.
        cleaned_org = string_utils.clean_sentence(author.get('org', ''))
        org_variants = cleaned_org + ' ' + cleaned_org.replace(' ', '_')
        org_tokens = transform_feature(org_variants, 'org')

        if name_tokens:
            name_features.extend(name_tokens)
        if org_tokens:
            org_features.extend(org_tokens)

    # Join every group the same way: space-separated plus a trailing space.
    title_text, keywords_text, venue_text, name_text, org_text = (
        ' '.join(group) + ' '
        for group in (title_features, keywords_features, venue_features,
                      name_features, org_features))

    return name_text + org_text + title_text + keywords_text + venue_text
Пример #6
0
def extract_author_features(item, order=None):
    """Collect co-author name features and word features for a paper record.

    Args:
        item: dict-like paper record with an ``authors`` list of dicts.
        order: when not ``None``, only the author at this index is processed;
            otherwise every author is processed and the results accumulate.

    Returns:
        A 2-tuple ``(author_features, word_features)``. ``author_features``
        holds one entry per usable co-author name. ``word_features`` is the
        concatenation of the paper's title, keyword and venue features,
        extended with the cleaned organisation of each processed author.
    """
    title_features, keywords_features, venue_features = extract_common_features(
        item)
    word_features = title_features + keywords_features + venue_features
    author_features = []
    authors = item["authors"]

    for i, author in enumerate(authors):
        if order is not None and i != order:
            continue

        # The cleaned organisation is added only when its length exceeds two
        # (the unit depends on what clean_sentence returns).
        org_name = string_utils.clean_sentence(author.get("org", ""),
                                               stemming=True)
        if len(org_name) > 2:
            word_features += org_name

        for j, coauthor in enumerate(authors):
            if i == j:
                continue
            coauthor_name = coauthor.get("name", "")
            # Null names are skipped; identity is the idiomatic None check.
            if coauthor_name is None:
                continue
            stripped_name = coauthor_name.strip()
            if not stripped_name:
                # Blank or whitespace-only names carry no information.
                continue
            if len(stripped_name) > 2:
                author_features.append(string_utils.clean_name(coauthor_name))
            else:
                # Very short names bypass clean_name and are only lower-cased.
                author_features.append(coauthor_name.lower())

    return author_features, word_features
Пример #7
0
def pre_data(author_dict, pub_dict):
    """Clean the string data of the author and publication dictionaries.

    Author names (the keys of ``author_dict``) are normalised with
    ``string_utils.clean_name_author``. Every publication in ``pub_dict`` is
    updated in place: the fields listed in ``pub_string_item`` are passed
    through ``string_utils.clean_sentence`` and the ``authors`` list is
    rebuilt with cleaned ``name`` and ``org`` values. Per the original note,
    the cleaning is meant to drop stop words and lower-case names.

    Both results are also written as JSON files under ``../data/``.

    Returns:
        A 2-tuple ``(author_new, pub_dict)``: the re-keyed author dictionary
        and the (mutated) publication dictionary.
    """
    # Re-key the author mapping by the cleaned author name.
    author_new = {
        string_utils.clean_name_author(raw_name): author_dict[raw_name]
        for raw_name in author_dict
    }

    # Clean every string field and author entry of each publication in place.
    for pub_id in pub_dict:
        publication = pub_dict[pub_id]
        for field in pub_string_item:
            publication[field] = string_utils.clean_sentence(publication[field])
        publication["authors"] = [
            {'name': string_utils.clean_name(entry['name']),
             'org': string_utils.clean_sentence(entry['org'])}
            for entry in publication["authors"]
        ]

    dump_json(author_new, wfpath='../data/', wfname='train_author_new.json', indent=4)
    dump_json(pub_dict, wfpath='../data/', wfname='train_pub_new.json', indent=4)
    return author_new, pub_dict
Пример #8
0
def extract_author_features(item, order=None):
    """Collect a flat feature list for the authors of a paper record.

    For each selected author the features are, in order: co-author name
    features, organisation features (the author's own first, then the
    co-authors'), followed by the paper's title, keyword, venue, abstract
    and field features.

    Args:
        item: dict-like paper record with an ``authors`` list of dicts.
        order: when not ``None``, only the author at this index is processed;
            otherwise the features of every author are concatenated.

    Returns:
        A single flat list merging the feature lists of all processed authors.
    """
    (title_features, keywords_features, venue_features, abst_features,
     fields_features) = extract_common_features(item)
    authors = item["authors"]

    merged_features = []
    for position, author in enumerate(authors):
        # Keep only the requested author when ``order`` is given.
        if order is not None and position != order:
            continue

        # The author's own organisation counts only when the cleaned name is
        # longer than two characters.
        own_org = string_utils.clean_name(author.get("org", ""))
        org_tokens = []
        if len(own_org) > 2:
            org_tokens.extend(transform_feature(own_org, "org"))

        # Walk over the co-authors (every author except the current one).
        name_tokens = []
        for other_position, other in enumerate(authors):
            if other_position == position:
                continue
            other_name = other.get("name", "")
            other_org = string_utils.clean_name(other.get("org", ""))
            if len(other_name) > 2:
                # Clean the co-author name and add its "name" features.
                cleaned_name = string_utils.clean_name(other_name)
                name_tokens.extend(transform_feature([cleaned_name], "name"))
            if len(other_org) > 2:
                # Clean the co-author organisation and add its "org" features.
                cleaned_org = string_utils.clean_sentence(other_org.lower())
                org_tokens.extend(transform_feature(cleaned_org, "org"))

        # Merge all groups for this author straight into the flat result.
        merged_features.extend(name_tokens + org_tokens + title_features +
                               keywords_features + venue_features +
                               abst_features + fields_features)
    return merged_features