def get_no_match_texts(argv, texts1):
    def get_non_match(name1, bucket_words, matching_set):
        #every word of name1 must have a bucket; return the first bucketed
        #name that is not already a known match for name1, otherwise None
        for word in name1.split(" "):
            if word in bucket_words:
                bucket = bucket_words[word]
            else:
                return None
            if len(bucket[1]) > 1:
                for name2 in bucket[1]:
                    if (name1, name2[1]) not in matching_set:
                        return name2[1]
        return None
    no_match_texts = []
    #this should not be done here and needs to be fixed up before more work is done
    #it should instead be done by a single function in matcher_functions
    #establish connection to database
    con, meta = connect(argv[1], argv[2], argv[3])
    #load pairs from database
    aliases = get_aliases(con, meta)
    #create dictionaries mapping serial numbers to names and names to serial numbers
    num_to_word, word_to_num = create_double_num_dicts(aliases)
    #load the buckets from the database; bucket_list is arranged as follows:
    #bucket_list[pair_of_buckets][bucket (must be 0 or 1)][name (a single name)][0 for number, 1 for pre-processed name]
    bucket_list, bucket_words = load_good_buckets('wordtable1', 'wordtable2', word_to_num, con, meta)
    for index in range(len(texts1)):
        new_text = get_non_match(texts1[index], bucket_words, aliases)
        if new_text is None:
            new_text = texts1[(index + 1) % len(texts1)]
        no_match_texts.append(new_text)
    return no_match_texts
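
#a minimal usage sketch, not part of the original code: the sample names and
#the assumption that user, password, and database arrive as argv[1:4] are
#illustrative only, matching the connect(argv[1], argv[2], argv[3]) call above
if __name__ == '__main__':
    import sys
    sample_texts = ["acme corp", "globex corporation", "initech llc"]  #made-up names
    non_matches = get_no_match_texts(sys.argv, sample_texts)
    for original, candidate in zip(sample_texts, non_matches):
        print('%s -> %s' % (original, candidate))
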
Example #2
def __init__(self, user, password, database, test_pairs, bucket_number):
    con, meta = connect(user, password, database)
    num_to_word, word_to_num = create_double_num_dicts(
        get_aliases(con, meta))
    bucket_list, bucket_words = load_good_buckets(
        'wordtable1', 'wordtable2', word_to_num, con, meta)
    self.rarity_match = {}
    #for every pair of small buckets (both sides hold at most bucket_number
    #names) map each pre-processed name on side 0 to a name from side 1
    for pair in bucket_list:
        if len(pair[0]) <= bucket_number and len(pair[1]) <= bucket_number:
            for i in range(len(pair[0])):
                for j in range(len(pair[1])):
                    self.rarity_match[pair[0][i][1]] = pair[1][j][1]
    self.test_pairs = test_pairs
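
#a hedged usage sketch: the enclosing class name is not shown in this snippet,
#so RarityMatcher below is a made-up stand-in; per the bucket layout described
#above, pair[side][i][1] is the pre-processed name string, so rarity_match
#maps a name from bucket 0 to a candidate name from bucket 1
matcher = RarityMatcher('dbuser', 'dbpass', 'aliasdb',
                        test_pairs=[], bucket_number=3)
for name0, name1 in matcher.rarity_match.items():
    print('%s is paired with %s' % (name0, name1))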
if os.path.isdir(fpath):
    raise ValueError('bad data directory')

if sys.version_info < (3,):
    f = open(fpath)
else:
    f = open(fpath, encoding='latin-1')
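
#a sketch of an alternative, not from the original code: io.open accepts an
#encoding argument on Python 2 as well, so a single call can replace the
#version check; note it yields decoded text on both interpreters, whereas the
#bare open() in the Python 2 branch above returns raw bytes
import io
f_alt = io.open(fpath, encoding='latin-1')
f_alt.close()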


#change to get from sql and not read from file
con, meta = connect(argv[1], argv[2], argv[3])
#load pairs from database
aliases = get_aliases(con, meta)
for pair in aliases:
    num_tokens = len(pair[0].strip().split(' ')) + len(pair[1].strip().split(' '))
    if 0 < num_tokens < MAX_SEQUENCE_LENGTH:
        texts1.append(pair[0])
        texts2.append(pair[1])
f.close()
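
#the pair filter above, pulled out as a small helper for readability (a
#sketch, not part of the original code): a pair is kept only when the combined
#token count of both names is positive and below MAX_SEQUENCE_LENGTH
def keep_pair(pair, max_len=MAX_SEQUENCE_LENGTH):
    num_tokens = len(pair[0].strip().split(' ')) + len(pair[1].strip().split(' '))
    return 0 < num_tokens < max_len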
#this returns a new set of texts to use as similar non-matches for texts1
def get_no_match_texts(argv, texts1):
Example #4
    parser.add_argument('-u', dest="user", help="username")
    parser.add_argument('-p', dest="password", help="password")
    parser.add_argument('-d', dest="db", help="dbname")
    parser.add_argument('-o', dest="output_file", help="output file name")

    parser.add_argument('-a',
                        dest="num_pairs",
                        help="number of same pairs in db",
                        nargs='?',
                        default=2,
                        type=int)

    args = parser.parse_args()
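
    #with the flags declared above, an invocation of this script might look
    #like the following (the script name and argument values are made up):
    #    python build_pairs.py -u dbuser -p dbpass -d aliasdb -o pairs.out -a 4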

    #change to get from sql and not read from file
    con, meta = connect(args.user, args.password, args.db)

    # get all names first
    entities, entity2names = process_aliases(con, meta)
    tokenizer = Tokenizer(
        num_words=Named_Entity_Recognition_Modified.MAX_NB_WORDS)
    tokenizer.fit_on_texts(entities)
    sequences = tokenizer.texts_to_sequences(entities)
    print(sequences)

    sequences = pad_sequences(
        sequences,
        maxlen=Named_Entity_Recognition_Modified.MAX_SEQUENCE_LENGTH)

    word_index = tokenizer.word_index
    num_words = len(word_index) + 1
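
    #a small self-contained illustration of the tokenization calls above; the
    #names below are made up and this block is a sketch, not part of the
    #original example
    demo_texts = ['acme holdings', 'acme corp', 'globex corp international']
    demo_tok = Tokenizer(num_words=1000)
    demo_tok.fit_on_texts(demo_texts)
    #more frequent words get lower ids starting at 1,
    #e.g. {'acme': 1, 'corp': 2, 'holdings': 3, ...}
    print(demo_tok.word_index)
    #pad_sequences left-pads each sequence with zeros up to maxlen by default
    print(pad_sequences(demo_tok.texts_to_sequences(demo_texts), maxlen=5))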
    parser.add_argument('-u', dest="user", help="username")
    parser.add_argument('-p', dest="password", help="password")
    parser.add_argument('-d', dest="db", help="dbname")
    parser.add_argument('-a',
                        dest="num_pairs",
                        help="number of same pairs in db",
                        nargs='?',
                        default=2,
                        type=int)

    args = parser.parse_args()

    texts1 = []  # list of text samples in part 1
    texts2 = []  # list of text samples in part 2

    #change to get from sql and not read from file
    con, meta = connect(args.user, args.password, args.db)
    aliases = get_aliases_with_ids(con, meta)

    unique_aliases = []

    # collect up all the anchors that are unique (anchors will get repeated if num_pairs > 2)
    prev = int(aliases[0][2])
    unique_aliases.append(aliases[0])
    for pair in aliases:
        texts1.append(pair[0])
        texts2.append(pair[1])
        if int(pair[2]) != prev:
            unique_aliases.append(pair)
            prev = int(pair[2])

    print('Found %s texts.' % len(texts1))
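
    #an equivalent way to collect the unique anchors (a sketch, not part of
    #the original code): track alias ids already seen instead of comparing
    #against the previous row, which also works when repeated ids are not
    #stored consecutively
    seen_ids = set()
    unique_anchor_rows = []
    for row in aliases:
        if int(row[2]) not in seen_ids:
            seen_ids.add(int(row[2]))
            unique_anchor_rows.append(row)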