def extract_common_features(item):
    """Extract the paper-level features shared by every author of ``item``.

    Reads the required ``"title"`` entry plus the optional ``"keywords"``,
    ``"fields"``, ``"venue"`` and ``"abst"`` entries, cleans each one with
    ``string_utils`` and tags the result through ``transform_feature``.

    Args:
        item: Publication mapping; ``item["title"]`` must exist.

    Returns:
        The tuple ``(title_features, keywords_features, venue_features,
        abst_features, fields_features)``. An optional entry that is missing
        or falsy contributes ``[]``; so does a venue of two characters or
        fewer.
    """
    # Title: cleaned (with stemming) first, lower-cased afterwards.
    cleaned_title = string_utils.clean_sentence(item["title"], stemming=True)
    title_features = transform_feature(cleaned_title.lower(), "title")

    # Keywords: every keyword is normalised with clean_name before tagging.
    keywords = item.get("keywords")
    keywords_features = (
        transform_feature(
            [string_utils.clean_name(kw) for kw in keywords], 'keyword')
        if keywords else []
    )

    # Fields: handled exactly like keywords, under their own tag.
    fields = item.get('fields')
    fields_features = (
        transform_feature(
            [string_utils.clean_name(fld) for fld in fields], 'fields')
        if fields else []
    )

    # Venue: names of two characters or fewer are ignored.
    venue_name = item.get('venue', '')
    if venue_name and len(venue_name) > 2:
        venue_features = transform_feature(
            string_utils.clean_sentence(venue_name.lower()), "venue")
    else:
        venue_features = []

    # Abstract: lower-cased before cleaning.
    abst = item.get('abst')
    abst_features = (
        transform_feature(string_utils.clean_sentence(abst.lower()), "abst")
        if abst else []
    )

    return (title_features, keywords_features, venue_features,
            abst_features, fields_features)
def extract_common_features(item):
    """Extract the title, keyword and venue features of a publication.

    Every text is passed through ``string_utils.clean_sentence`` with
    ``stemming=True``.

    Args:
        item: Publication mapping. ``item["title"]`` must exist;
            ``"keywords"`` and ``"venue"`` are optional and may be ``None``.

    Returns:
        The tuple ``(title_features, keywords_features, venue_features)``.
        ``keywords_features`` is a list extended with the cleaned output of
        each keyword. ``venue_features`` is ``[]`` when the venue is missing,
        ``None`` or two characters or fewer.
    """
    title_features = string_utils.clean_sentence(item["title"], stemming=True)

    keywords_features = []
    for keyword in item.get("keywords") or ():
        keywords_features.extend(
            string_utils.clean_sentence(keyword, stemming=True))

    # `.get('venue', '')` still yields None when the key is present with a
    # null value, and len(None) raises TypeError; coerce to '' instead.
    venue_name = item.get('venue') or ''
    venue_features = []
    if len(venue_name) > 2:
        venue_features = string_utils.clean_sentence(
            venue_name.lower(), stemming=True)

    return title_features, keywords_features, venue_features
def extract_common_features(item):
    """Extract the tagged title, keyword and venue features of a publication.

    Each text is cleaned with ``string_utils`` and then tagged through
    ``transform_feature``.

    Args:
        item: Publication mapping. ``item["title"]`` must exist;
            ``"keywords"`` and ``"venue"`` are optional and may be ``None``.

    Returns:
        The tuple ``(title_features, keywords_features, venue_features)``.
        ``keywords_features`` is ``[]`` when there are no keywords;
        ``venue_features`` is ``[]`` when the venue is missing, ``None`` or
        two characters or fewer.
    """
    # Title: cleaned (with stemming) first, lower-cased afterwards.
    cleaned_title = string_utils.clean_sentence(item["title"], stemming=True)
    title_features = transform_feature(cleaned_title.lower(), "title")

    keywords_features = []
    keywords = item.get("keywords")
    if keywords:
        keywords_features = transform_feature(
            [string_utils.clean_name(k) for k in keywords], 'keyword')

    # `.get('venue', '')` still yields None when the key is present with a
    # null value, and len(None) raises TypeError; coerce to '' instead.
    venue_name = item.get('venue') or ''
    venue_features = []
    if len(venue_name) > 2:
        venue_features = transform_feature(
            string_utils.clean_sentence(venue_name.lower()), "venue")

    return title_features, keywords_features, venue_features
def extract_author_features(item, order=None):
    """Build one flat feature list for the authors of ``item``.

    For every processed author the contribution is, in this order: the tagged
    names of all *other* authors, the tagged organisation features (the
    author's own first, then each coauthor's), and finally the title, keyword
    and venue features returned by ``extract_common_features``.

    Args:
        item: Publication mapping with an ``"authors"`` list of dicts
            (optional ``"name"`` and ``"org"`` keys) plus whatever
            ``extract_common_features`` reads.
        order: Index of the single author to process; ``None`` processes
            every author.

    Returns:
        A flat list made of the per-author feature lists concatenated in
        author order.
    """
    title_features, keywords_features, venue_features = extract_common_features(
        item)
    authors = item["authors"]

    author_features = []
    for i, author in enumerate(authors):
        if order is not None and i != order:
            continue

        name_feature = []
        org_features = []

        # The author's own organisation; very short names are ignored.
        org_name = string_utils.clean_name(author.get("org", ""))
        if len(org_name) > 2:
            org_features.extend(transform_feature(org_name, "org"))

        for j, coauthor in enumerate(authors):
            if i == j:
                continue
            # "name" may be present but null; `.get("name", "")` would then
            # return None and len(None) raises TypeError, so coerce to "".
            coauthor_name = coauthor.get("name") or ""
            coauthor_org = string_utils.clean_name(coauthor.get("org", ""))
            if len(coauthor_name) > 2:
                name_feature.extend(
                    transform_feature(
                        [string_utils.clean_name(coauthor_name)], "name"))
            if len(coauthor_org) > 2:
                org_features.extend(
                    transform_feature(
                        string_utils.clean_sentence(coauthor_org.lower()),
                        "org"))

        author_features.append(name_feature + org_features + title_features +
                               keywords_features + venue_features)

    # Flatten the per-author lists into a single list.
    return list(chain.from_iterable(author_features))
def extract_author_features(item):
    """Concatenate all author and paper features of ``item`` into one string.

    Name and organisation features are collected over every author in
    ``item['authors']``; title, keyword and venue features come from
    ``extract_common_features``.

    Args:
        item: Publication mapping with an ``'authors'`` list of dicts
            (optional ``'name'`` and ``'org'`` keys).

    Returns:
        A single string: the name, org, title, keyword and venue groups in
        that order, each group space-joined and followed by one trailing
        space.
    """
    title_features, keywords_features, venue_features = extract_common_features(
        item)

    name_features = []
    org_features = []
    for author in item['authors']:
        cleaned_name = string_utils.clean_name(author.get('name', ''))
        tagged_name = transform_feature(cleaned_name, 'name')

        # The org text is used twice: as cleaned, and with its spaces
        # replaced by underscores.
        cleaned_org = string_utils.clean_sentence(author.get('org', ''))
        org_text = cleaned_org + ' ' + cleaned_org.replace(' ', '_')
        tagged_org = transform_feature(org_text, 'org')

        if tagged_name:
            name_features.extend(tagged_name)
        if tagged_org:
            org_features.extend(tagged_org)

    groups = (name_features, org_features, title_features,
              keywords_features, venue_features)
    return ''.join(' '.join(group) + ' ' for group in groups)
def extract_author_features(item, order=None):
    """Collect coauthor-name features and word features for ``item``.

    Args:
        item: Publication mapping with an ``"authors"`` list of dicts
            (optional ``"name"`` and ``"org"`` keys) plus whatever
            ``extract_common_features`` reads.
        order: Index of the single author to process; ``None`` processes
            every author.

    Returns:
        The tuple ``(author_features, word_features)``.
        ``author_features`` lists, for each processed author, the names of
        all other authors: ``clean_name`` output when the stripped name is
        longer than two characters, otherwise the raw name lower-cased.
        Names that are ``None`` or blank are skipped.
        ``word_features`` is title + keyword + venue features, followed by
        the cleaned organisation of each processed author whose cleaned
        organisation has length greater than two.
    """
    title_features, keywords_features, venue_features = extract_common_features(
        item)
    word_features = title_features + keywords_features + venue_features
    author_features = []
    authors = item["authors"]

    for i, author in enumerate(authors):
        if order is not None and i != order:
            continue

        org_name = string_utils.clean_sentence(author.get("org", ""),
                                               stemming=True)
        if len(org_name) > 2:
            word_features += org_name

        for j, coauthor in enumerate(authors):
            if i == j:
                continue
            coauthor_name = coauthor.get("name", "")
            if coauthor_name is None:
                continue
            stripped = coauthor_name.strip()
            if not stripped:
                continue
            if len(stripped) > 2:
                author_features.append(string_utils.clean_name(coauthor_name))
            else:
                # Very short names are kept as-is, only lower-cased.
                author_features.append(coauthor_name.lower())

    return author_features, word_features
def pre_data(author_dict, pub_dict):
    """Clean author names and publication strings, then dump both to JSON.

    The original author's note describes this as preprocessing the string
    data: removing stop words and lower-casing names. The actual cleaning is
    delegated to ``string_utils``.

    Args:
        author_dict: Mapping keyed by author name. It is not modified.
        pub_dict: Mapping from publication id to a publication dict that has
            every key in ``pub_string_item`` and an ``"authors"`` list of
            dicts with ``'name'`` and ``'org'``. It is modified in place.

    Returns:
        The tuple ``(author_new, pub_dict)``, where ``author_new`` is
        ``author_dict`` re-keyed by ``string_utils.clean_name_author``.

    Side effects:
        Writes ``train_author_new.json`` and ``train_pub_new.json`` under
        ``../data/`` through ``dump_json``.
    """
    # Re-key the author mapping by the cleaned author name.
    author_new = {
        string_utils.clean_name_author(raw_name): value
        for raw_name, value in author_dict.items()
    }

    # Clean every string entry of every publication.
    for pub in pub_dict.values():
        for field in pub_string_item:
            pub[field] = string_utils.clean_sentence(pub[field])
        # Rebuild the author list keeping only the cleaned name and org.
        pub["authors"] = [
            {'name': string_utils.clean_name(author['name']),
             'org': string_utils.clean_sentence(author['org'])}
            for author in pub["authors"]
        ]

    dump_json(author_new, wfpath='../data/', wfname='train_author_new.json',
              indent=4)
    dump_json(pub_dict, wfpath='../data/', wfname='train_pub_new.json',
              indent=4)
    return author_new, pub_dict
def extract_author_features(item, order=None):
    """Build one flat feature list for the authors of ``item``.

    For every processed author the contribution is, in this order: the tagged
    names of all *other* authors, the tagged organisation features (the
    author's own first, then each coauthor's), and finally the title,
    keyword, venue, abstract and field features returned by
    ``extract_common_features``.

    Args:
        item: Publication mapping with an ``"authors"`` list of dicts
            (optional ``"name"`` and ``"org"`` keys) plus whatever
            ``extract_common_features`` reads.
        order: Index of the single author to process; ``None`` processes
            every author.

    Returns:
        A flat list made of the per-author feature lists concatenated in
        author order.
    """
    # Paper-level features shared by all authors.
    (title_features, keywords_features, venue_features, abst_features,
     fields_features) = extract_common_features(item)
    authors = item["authors"]

    author_features = []
    for i, author in enumerate(authors):
        # When `order` is given, only the author at that index is processed.
        if order is not None and i != order:
            continue

        name_feature = []
        org_features = []

        # Normalised organisation name of the author. Per the original
        # author's note, clean_name splits on ".", "-", " " and lower-cases.
        org_name = string_utils.clean_name(author.get("org", ""))
        if len(org_name) > 2:
            org_features.extend(transform_feature(org_name, "org"))

        # Walk over the coauthors (every author except the current one).
        for j, coauthor in enumerate(authors):
            if i == j:
                continue
            # "name" may be present but null; `.get("name", "")` would then
            # return None and len(None) raises TypeError, so coerce to "".
            coauthor_name = coauthor.get("name") or ""
            coauthor_org = string_utils.clean_name(coauthor.get("org", ""))
            if len(coauthor_name) > 2:
                # Add the coauthor's tagged name to the name features.
                name_feature.extend(
                    transform_feature(
                        [string_utils.clean_name(coauthor_name)], "name"))
            if len(coauthor_org) > 2:
                # Add the coauthor's tagged organisation to the org features.
                org_features.extend(
                    transform_feature(
                        string_utils.clean_sentence(coauthor_org.lower()),
                        "org"))

        # Merge all feature groups for this author.
        author_features.append(name_feature + org_features + title_features +
                               keywords_features + venue_features +
                               abst_features + fields_features)

    # Flatten the per-author lists into a single list of features.
    return list(chain.from_iterable(author_features))