def custom_similarity(person, key, words_dict):
    """Score how strongly *person*'s cached page matches the tf-idf words for *key*.

    Counts occurrences of each tf-idf word in the person's text, weighted by
    that word's max-normalised tf-idf weight, then normalises by the number
    of sentences.  The score is scaled by 55 and capped at 7 (empirically
    chosen constants — TODO confirm with training data).  Returns 0 when the
    person file does not exist or the tf-idf dict is empty/all-zero.
    """
    result = 0
    person_file = os.path.join(definitions.PERSONS_DIR,
                               p_lib.remove_spaces(person) + ".txt")
    if os.path.isfile(person_file):
        with open(person_file, 'r', encoding='utf8') as fr:
            tfidf_words = words_dict[key]
            # The original sorted the entire dict just to read the largest
            # weight; max() is O(n).  default=0 also guards an empty dict,
            # and the explicit check avoids division by zero below.
            max_weight = max(tfidf_words.values(), default=0)
            if max_weight == 0:
                return 0
            file_content = fr.read()
            person_words = p_lib.split_to_words(file_content.lower())
            document_dict = Counter(person_words)
            sentences_count = len(split_into_sentences(file_content))
            for word, weight in tfidf_words.items():
                if word in document_dict:
                    result += document_dict[word] * (weight / max_weight)
            if sentences_count == 0:
                sentences_count = 1  # empty document: avoid division by zero
            result = result / sentences_count
            result *= 55
            if result > 7:
                return 7
    return result
def is_profession_negative(person, profession):
    """Return True when the person's cached page mentions none of the
    profession's similarity words (or when no page exists)."""
    similarity_words = prof_lib.get_similarity_words(profession)
    path = os.path.join(definitions.PERSONS_DIR,
                        p_lib.remove_spaces(person) + ".txt")
    if os.path.isfile(path):
        with open(path, 'r', encoding='utf8') as fr:
            text = fr.read()
        for word in similarity_words:
            if word in text:
                return False
    return True
def find_nationality_similarity(person_name, nationality):
    """Word2vec similarity between a person and a nationality, rescaled via
    custom_similarity.  Falls back to DEFAULT_SIMILARITY on any failure
    (e.g. an out-of-vocabulary token — best-effort by design)."""
    global model
    person_name = persons.remove_spaces(person_name)
    nationality = nationalities.remove_spaces(nationality)
    try:
        raw = abs(model.similarity(person_name.lower(), nationality.lower()))
        return custom_similarity(raw, NATIONALITY_MULTIPLIER)
    except Exception:
        # logging.error(traceback.format_exc())
        return definitions.DEFAULT_SIMILARITY
def is_nationality_negative(person, nationality):
    """Return True unless *nationality* appears in the person's cached page
    after rewriting every known synonym to its canonical country name."""
    path = os.path.join(definitions.PERSONS_DIR,
                        p_lib.remove_spaces(person) + ".txt")
    if os.path.isfile(path):
        with open(path, 'r', encoding='utf8') as fr:
            content = fr.read()
        # Canonicalise synonyms before the membership test.
        for synonym, country in nat_lib.nationalities_dict.items():
            content = content.replace(synonym, country)
        if nationality in content:
            return False
    return True
def handle_mayreferto_person(*args):
    """Re-download a person's page when the cached file is a disambiguation stub.

    args[0] is a line whose first token is the person's name.  Small cached
    files (< 200 bytes) whose first line reads like a disambiguation page
    ('may refer to' / 'is the name of') are deleted and fetched again.
    """
    line = args[0]
    person_name = line.split(' ', 1)[0]
    modified_name = persons.remove_spaces(person_name)
    file_name = os.path.join(PERSONS_DIR, modified_name + '.txt')
    if os.path.isfile(file_name) and os.path.getsize(file_name) < 200:
        with open(file_name, encoding='utf8', mode='r') as person_file:
            first_line = person_file.readline()
        # The original called person_file.close() by hand inside the with
        # block so it could delete the open file (required on Windows);
        # deleting after the block lets the context manager own the close.
        if 'may refer to' in first_line or 'is the name of' in first_line:
            os.remove(file_name)
            download_file(person_name, file_name)
            print(person_name)
def add_training_data(professions):
    """Append each positively-labelled person's page text to its profession
    bucket in the *professions* mapping (mutated in place)."""
    train_path = os.path.join(TRAINING_DIR, 'all_positive_profession.train')
    with open(train_path, encoding='utf8', mode='r') as f:
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            person = parts[0]
            profession = parts[1]
            person_path = os.path.join(definitions.PERSONS_DIR,
                                       p_lib.remove_spaces(person) + ".txt")
            with open(person_path, 'r', encoding='utf8') as pf:
                professions[profession] += "\n" + pf.read()
            print(i, person)
def add_training_data(nationalities):
    """Append each positively-labelled person's page text to its nationality
    bucket in the *nationalities* mapping (mutated in place)."""
    train_path = os.path.join(TRAINING_DIR, 'all_positive_nationality.train')
    with open(train_path, encoding='utf8', mode='r') as f:
        for i, line in enumerate(f):
            tokens = line.rstrip().split(' ')
            person = tokens[0]
            nationality = tokens[1]
            person_path = os.path.join(definitions.PERSONS_DIR,
                                       p_lib.remove_spaces(person) + ".txt")
            with open(person_path, 'r', encoding='utf8') as pf:
                nationalities[nationality] += pf.read() + "\n"
            print(i, person)
def find_profession_similarity(person_name, profession):
    """Average word2vec similarity between a person and a profession's
    similarity words, rescaled via custom_similarity.

    Returns definitions.DEFAULT_SIMILARITY when the model cannot score some
    word (e.g. out-of-vocabulary — best-effort by design) or when the
    profession has no similarity words at all (the original divided by zero
    in that case).
    """
    global model
    person_name = persons.remove_spaces(person_name)
    profession_words = professions.get_similarity_words(profession)
    result = 0
    total_count = 0
    for word in profession_words:
        try:
            result += abs(model.similarity(person_name.lower(), word.lower()))
            total_count += 1
        except Exception:
            # logging.error(traceback.format_exc())
            return definitions.DEFAULT_SIMILARITY
    if total_count == 0:
        # Empty word list: avoid ZeroDivisionError.
        return definitions.DEFAULT_SIMILARITY
    result /= total_count
    return custom_similarity(result, PROFESSION_MULTIPLIER)
def find_similarity(person_name, term, inputType):
    """Look up *term* in the person's extracted nationalities or professions.

    inputType selects the extractor (TYPE_NATIONALITY or TYPE_PROFESSION);
    any other value raises TypeError.  Returns the stored score, or 0 when
    the person file is missing or the term was not extracted.
    """
    result = {}
    person_file = os.path.join(definitions.PERSONS_DIR,
                               p_lib.remove_spaces(person_name) + ".txt")
    if os.path.isfile(person_file):
        with open(person_file, 'r', encoding='utf8') as f:
            if inputType == definitions.TYPE_NATIONALITY:
                result = get_person_nationalities(f)
            elif inputType == definitions.TYPE_PROFESSION:
                result = get_person_professions(f)
            else:
                raise TypeError
    # 'in' on a dict already tests keys; the .keys() call was redundant.
    if term in result:
        return result[term]
    return 0
def get_positive_nationality(person):
    """Return the single unambiguous nationality on the person's page, if any.

    A nationality is positive only when exactly one known nationality is
    mentioned anywhere in the page AND it also appears in the first line;
    otherwise None.
    """
    known_nationalities = init_nationalities_empty_dict()
    path = os.path.join(definitions.PERSONS_DIR,
                        p_lib.remove_spaces(person) + ".txt")
    if not os.path.isfile(path):
        return None
    with open(path, 'r', encoding='utf8') as fr:
        first_line = fr.readline()
        fr.seek(0)
        content = fr.read()
    # Canonicalise synonyms in both the first line and the full body.
    for synonym, country in nat_lib.nationalities_dict.items():
        first_line = first_line.replace(synonym, country)
        content = content.replace(synonym, country)
    mentioned = tuple(nat for nat in known_nationalities if nat in content)
    # Special case: presumably 'Ireland' also matches inside
    # 'Republic of Ireland' — TODO confirm against the nationality list.
    if len(mentioned) == 2 and 'Republic of Ireland' in mentioned:
        mentioned = ['Republic of Ireland']
    if len(mentioned) == 1 and mentioned[0] in first_line:
        return mentioned[0]
    return None
def get_positive_profession(person):
    """Return the single unambiguous profession on the person's page, if any.

    A profession counts when all of its similarity words occur in the page;
    it is returned only when exactly one profession matches the whole page
    and exactly one matches the first line.
    """
    known_professions = init_professions_empty_dict()
    path = os.path.join(definitions.PERSONS_DIR,
                        p_lib.remove_spaces(person) + ".txt")
    if not os.path.isfile(path):
        return None
    with open(path, 'r', encoding='utf8') as fr:
        first_line = fr.readline()
        fr.seek(0)
        content = fr.read()
    in_page = []
    in_first_sentence = []
    for profession in known_professions:
        words = prof_lib.get_similarity_words(profession)
        if all(w in content for w in words):
            in_page.append(profession)
        if all(w in first_line for w in words):
            in_first_sentence.append(profession)
    if len(in_page) == 1 and len(in_first_sentence) == 1:
        return in_first_sentence[0]
    return None
def download_file(*args):
    """Fetch a person's dbpedia page and cache it as a local text file.

    args[0] is a line whose first token is the person's name.  Does nothing
    when the cache file already exists or the fetched content is empty.
    HTTP errors print a short code+url line; any other error is logged with
    a traceback (best-effort download by design).
    """
    line = args[0]
    person_name = line.split(' ', 1)[0]
    modified_name = persons.remove_spaces(person_name)
    file_name = os.path.join(PERSONS_DIR, modified_name + '.txt')
    url = 'http://dbpedia.org/page/' + urllib.parse.quote(modified_name)
    try:
        if not os.path.isfile(file_name):
            html_content = modify_html_content(get_html_content(url))
            if len(html_content) > 0:
                # mode='x' keeps the original create-only semantics; the
                # context manager replaces the hand-rolled try/finally close.
                with open(file_name, encoding='utf8', mode='x') as fw:
                    fw.write(html_content)
    except urllib.error.HTTPError as e:
        print(str(e.code) + ": " + url)
    except Exception:
        logging.error(traceback.format_exc())
import os

from wsdm.ts.helpers.persons import persons
from definitions import NOMENCLATURES_DIR
from definitions import PERSONS_DIR

if __name__ == '__main__':
    # Write every nomenclature person whose page has not been downloaded
    # yet into missing_persons.txt (one original input line per person).
    persons_path = os.path.join(NOMENCLATURES_DIR, "persons.txt")
    missing_path = os.path.join(NOMENCLATURES_DIR, "missing_persons.txt")
    with open(persons_path, encoding='utf8', mode='r') as fr, \
            open(missing_path, encoding='utf8', mode='w') as fw:
        for line in fr:
            name = line.split(' ', 1)[0]
            cached = os.path.join(PERSONS_DIR,
                                  persons.remove_spaces(name) + '.txt')
            if not os.path.isfile(cached):
                fw.write(line)
def has_file(person):
    """True when a cached text file exists for *person*."""
    path = os.path.join(definitions.PERSONS_DIR,
                        p_lib.remove_spaces(person) + ".txt")
    return os.path.isfile(path)