def get_no_match_texts(argv, texts1):
    def get_non_match(name1, bucket_words, matching_set):
        # every word of name1 must have a bucket; keep the last word's bucket
        for word in name1.split(" "):
            if word in bucket_words:
                bucket = bucket_words[word]
            else:
                return None
        # return the first name in the opposite bucket that is not a known match
        if len(bucket[1]) > 1:
            for name2 in bucket[1]:
                if (name1, name2[1]) not in matching_set:
                    return name2[1]
        return None

    no_match_texts = []
    #this should not be done here and needs to be fixed up before more work is done
    #it should instead be done by a single function in matcher_functions
    #establish connection to database
    con, meta = connect(argv[1], argv[2], argv[3])
    #load pairs from database
    aliases = get_aliases(con, meta)
    #create dictionaries mapping serial numbers to names and names to serial numbers
    num_to_word, word_to_num = create_double_num_dicts(aliases)
    #load the buckets from the database; bucket_list is arranged as follows:
    #bucket_list[pair_of_buckets][bucket (0 or 1)][name (a single name)][0 for number, 1 for pre-processed name]
    bucket_list, bucket_words = load_good_buckets('wordtable1', 'wordtable2',
                                                  word_to_num, con, meta)
    for index in range(len(texts1)):
        new_text = get_non_match(texts1[index], bucket_words, aliases)
        if new_text is None:
            # fall back to the next text in the list so every entry gets a non-match
            new_text = texts1[(index + 1) % len(texts1)]
        no_match_texts.append(new_text)
    return no_match_texts
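# Illustrative sketch (not from the original code): get_non_match above expects
# bucket_words to map each word to a pair of buckets, where each bucket entry is
# a (serial number, pre-processed name) tuple. The toy data below is an
# assumption built to match that shape.
toy_bucket_words = {
    'smith': (
        [(0, 'smith')],                      # bucket 0: names from wordtable1
        [(1, 'j smith'), (2, 'jon smyth')],  # bucket 1: candidate non-matches
    ),
}
toy_matching_set = {('smith', 'j smith')}    # a known alias pair to skip
# A lookup for 'smith' walks bucket 1, skips the known match 'j smith',
# and returns 'jon smyth' as a similar-but-non-matching text.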
def __init__(self, user, password, database, test_pairs, bucket_number):
    con, meta = connect(user, password, database)
    num_to_word, word_to_num = create_double_num_dicts(get_aliases(con, meta))
    bucket_list, bucket_words = load_good_buckets('wordtable1', 'wordtable2',
                                                  word_to_num, con, meta)
    # for small (rare-word) bucket pairs, map each name in bucket 0
    # to a name from bucket 1
    self.rarity_match = {}
    for pair in bucket_list:
        if len(pair[0]) <= bucket_number and len(pair[1]) <= bucket_number:
            for i in range(len(pair[0])):
                for j in range(len(pair[1])):
                    self.rarity_match[pair[0][i][1]] = pair[1][j][1]
    self.test_pairs = test_pairs
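# Hypothetical usage sketch. Only __init__ appears in this excerpt, so the
# class name RarityMatcher, the credentials, and the pairs below are all
# assumptions for illustration:
#
#   matcher = RarityMatcher('dbuser', 'dbpass', 'aliasdb',
#                           test_pairs=[('acme corp', 'acme corporation')],
#                           bucket_number=3)
#   # names that share a rare word map to a candidate from the opposite table
#   candidate = matcher.rarity_match.get('acme corp')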
if os.path.isdir(fpath):
    raise ValueError('bad data directory')
# Python 2 has no encoding argument on open()
if sys.version_info < (3,):
    f = open(fpath)
else:
    f = open(fpath, encoding='latin-1')

#change to get from sql and not read from file
con, meta = connect(argv[1], argv[2], argv[3])
#load pairs from database
aliases = get_aliases(con, meta)
# keep only pairs whose combined token count fits in a training sequence
for pair in aliases:
    num_tokens = len(pair[0].strip().split(' ')) + len(pair[1].strip().split(' '))
    if 0 < num_tokens < MAX_SEQUENCE_LENGTH:
        texts1.append(pair[0])
        texts2.append(pair[1])
f.close()

#this returns a new set of texts to use as similar non-matches for texts1
def get_no_match_texts(argv, texts1):
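# Worked example of the length filter above (values assumed for illustration):
# for pair = ('international business machines', 'ibm'),
# num_tokens = 3 + 1 = 4, and with MAX_SEQUENCE_LENGTH = 25,
# 0 < 4 < 25 holds, so the pair is kept in texts1/texts2.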
parser.add_argument('-u', dest="user", help="username")
parser.add_argument('-p', dest="password", help="password")
parser.add_argument('-d', dest="db", help="dbname")
parser.add_argument('-o', dest="output_file", help="output file name")
parser.add_argument('-a', dest="num_pairs", help="number of same pairs in db",
                    nargs='?', default=2, type=int)
args = parser.parse_args()

#change to get from sql and not read from file
con, meta = connect(args.user, args.password, args.db)
# get all names first
entities, entity2names = process_aliases(con, meta)
tokenizer = Tokenizer(
    num_words=Named_Entity_Recognition_Modified.MAX_NB_WORDS)
tokenizer.fit_on_texts(entities)
sequences = tokenizer.texts_to_sequences(entities)
print(sequences)
sequences = pad_sequences(
    sequences,
    maxlen=Named_Entity_Recognition_Modified.MAX_SEQUENCE_LENGTH)
word_index = tokenizer.word_index
num_words = len(word_index) + 1
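# Minimal, self-contained sketch of the tokenize-and-pad step above. The toy
# entities, num_words, and maxlen are assumptions, and the import paths assume
# standalone Keras (newer code would use tensorflow.keras.preprocessing):
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy_entities = ['john smith', 'acme corporation', 'smith']
toy_tokenizer = Tokenizer(num_words=1000)
toy_tokenizer.fit_on_texts(toy_entities)          # builds the word -> index map
toy_sequences = toy_tokenizer.texts_to_sequences(toy_entities)
print(toy_sequences)                              # e.g. [[2, 1], [3, 4], [1]]
padded = pad_sequences(toy_sequences, maxlen=5)   # left-pads with zeros
print(padded.shape)                               # (3, 5); indices start at 1,
                                                  # hence num_words = len(word_index) + 1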
parser.add_argument('-p', dest="password", help="password")
parser.add_argument('-d', dest="db", help="dbname")
parser.add_argument('-a', dest="num_pairs", help="number of same pairs in db",
                    nargs='?', default=2, type=int)
args = parser.parse_args()

texts1 = []  # list of text samples in part 1
texts2 = []  # list of text samples in part 2

#change to get from sql and not read from file
con, meta = connect(args.user, args.password, args.db)
aliases = get_aliases_with_ids(con, meta)
unique_aliases = []
# collect up all the anchors that are unique (anchors will get repeated if num_pairs > 2)
prev = int(aliases[0][2])
unique_aliases.append(aliases[0])
for row in aliases:  # each row is (name1, name2, anchor_id)
    texts1.append(row[0])
    texts2.append(row[1])
    if int(row[2]) != prev:
        unique_aliases.append(row)
        prev = int(row[2])
print('Found %s texts.' % len(texts1))
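# Self-contained sketch of the unique-anchor scan above. Rows are assumed to be
# (name1, name2, anchor_id) tuples sorted by anchor id, matching how
# get_aliases_with_ids appears to be used; the data is made up:
toy_aliases = [
    ('john smith', 'j smith', '1'),
    ('john smith', 'jon smyth', '1'),         # anchor 1 repeats when num_pairs > 2
    ('acme corp', 'acme corporation', '2'),
]
toy_unique = [toy_aliases[0]]
toy_prev = int(toy_aliases[0][2])
for toy_row in toy_aliases:
    if int(toy_row[2]) != toy_prev:           # id changed -> first row of a new anchor
        toy_unique.append(toy_row)
        toy_prev = int(toy_row[2])
print(len(toy_unique))                        # 2: one representative per anchor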