import pickle
import sys
from struct import pack

# TCustomCounter, MIN_WORD_FREQ_FOR_INDEX and MAX_WORD_FREQ_FOR_INDEX come
# from the surrounding module.

def prepare_matches(chunk_fname, keys_out_fname, values_out_fname, pid=""):
    keys_out = open(keys_out_fname, "wb", buffering=1000000)
    values_out = open(values_out_fname, "wb", buffering=1000000)
    # Opened here but not used in this excerpt.
    add_values_out = open(values_out_fname + ".add", "w", buffering=1000000)
    all_codes = []
    tokens = {}
    tokens_freqs = {}
    # First pass: group each token's code lists and tally its total frequency.
    for line in open(chunk_fname):
        token, token_codes = line.strip().split("\t")
        token_codes = [int(code) for code in token_codes.split()]
        tokens.setdefault(token, []).append(len(all_codes))
        all_codes.append(token_codes)
        tokens_freqs.setdefault(token, 0)
        tokens_freqs[token] += len(token_codes)
    progress_counter = TCustomCounter("Reducer%s" % str(pid), sys.stdout, verbosity=1, interval=10000)
    # Second pass: for each token inside the frequency band, merge and sort its
    # codes, append them to the values file, and record (token, freq, offset)
    # in the keys file so the posting list can be located later.
    for token, chunks in tokens.items():
        token_freq = tokens_freqs[token]
        if token_freq < MIN_WORD_FREQ_FOR_INDEX or token_freq > MAX_WORD_FREQ_FOR_INDEX:
            continue
        word_codes = []
        for chunk_index in chunks:
            word_codes += all_codes[chunk_index]
            all_codes[chunk_index] = []  # free memory as soon as a chunk is consumed
        word_codes.sort()
        start_position = values_out.tell()
        for code in word_codes:
            values_out.write(pack("Q", code))  # one unsigned 64-bit integer per code
        pickle.dump((token, token_freq, start_position), keys_out)
        progress_counter.add()
    # Close explicitly so the large write buffers are flushed.
    keys_out.close()
    values_out.close()
    add_values_out.close()
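To make the on-disk layout concrete, here is a minimal sketch of reading the index back. The helper name iter_postings is hypothetical; the layout it assumes is exactly what prepare_matches writes above: a keys file of pickled (token, freq, offset) tuples and a values file of unsigned 64-bit codes.

import pickle
from struct import calcsize, unpack

def iter_postings(keys_fname, values_fname):
    # Hypothetical reader for the files produced by prepare_matches.
    record_size = calcsize("Q")
    with open(keys_fname, "rb") as keys_in, open(values_fname, "rb") as values_in:
        while True:
            try:
                token, token_freq, start_position = pickle.load(keys_in)
            except EOFError:
                break  # end of the pickled key stream
            values_in.seek(start_position)
            raw = values_in.read(record_size * token_freq)
            codes = unpack("%dQ" % token_freq, raw)
            yield token, codes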
Example #2
import sys

def crawl_folder(self, folder):
    # Delegates to the module-level crawl_folder() helper, which yields
    # (object_folder, object_id) pairs; the method name shadows it here.
    object_folders = crawl_folder(folder)
    processed_counter = TCustomCounter("Crawler, found objects", sys.stderr, self.verbosity, 100)
    for object_folder, object_id in object_folders:
        fields2update = self.crawl_object_fields(object_folder, object_id)
        object2update = TIndexingObjectData(object_id=object_id,
                                            object_fields=fields2update)
        yield object2update
        processed_counter.add()
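The module-level crawl_folder() helper is not part of this excerpt. A minimal sketch of what it might look like, assuming each object lives in an immediate subdirectory whose name serves as the object id:

import os

def crawl_folder(folder):
    # Hypothetical helper: yields (object_folder, object_id) pairs, one per
    # direct subdirectory, matching what the method above consumes.
    for entry in sorted(os.listdir(folder)):
        object_folder = os.path.join(folder, entry)
        if os.path.isdir(object_folder):
            yield object_folder, entry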
Example #3
import hashlib
import sys

import chardet

# TCustomCounter, TIndexingObjectField, TIndexingObjectData, DEFAULT_ENCODING
# and LIB_SECTION_FIELD come from the surrounding module.

def crawl_csv(self, csv_file_path):
    field_index2name = {1: "year",
                        2: "udc",
                        # 3: "class_level1",  handled via hierarchy_indices below
                        # 4: "class_level2",
                        # 5: "class_level3",
                        6: "pages_count",
                        7: "author",
                        8: "title"}
    hierarchy_indices = [3, 4, 5]

    processed_counter = TCustomCounter("Crawler, found objects", sys.stderr, self.verbosity, 1000)
    # Sniff the file encoding from the raw bytes before decoding any lines.
    encoding = chardet.detect(open(csv_file_path, "rb").read())["encoding"]
    all_hierarchy_codes = {}
    for line in open(csv_file_path, encoding=encoding):
        field_values = line.strip().split(";")
        object_id = field_values[0]
        fields = []
        for field_index, field_id in field_index2name.items():
            if len(field_values) > field_index:
                field_value_encoded = field_values[field_index].encode(DEFAULT_ENCODING)
                fields.append(TIndexingObjectField(field_id,
                                                   field_value=field_value_encoded,
                                                   field_file_path=""))
        # Library-section feature: hash each prefix of the section path so
        # every level of the hierarchy gets a stable numeric code.
        hierarchy_codes = []
        hasher = hashlib.md5()
        path = ""
        for hierarchy_feat_index in hierarchy_indices:
            node_name = field_values[hierarchy_feat_index].strip()
            if not node_name:
                break
            hasher.update(node_name.encode("utf8"))  # rolling hash over the path prefix
            code = int(hasher.hexdigest(), 16) % 1000000007
            path += node_name + ";"
            hierarchy_codes.append(code)
            if code not in all_hierarchy_codes:
                all_hierarchy_codes[code] = path
            elif all_hierarchy_codes[code] != path:
                # Two distinct paths mapped to the same code: abort rather
                # than silently merge different library sections.
                print("Hash collision:", path, "vs.", all_hierarchy_codes[code])
                sys.exit("FULL STOP")

        fields.append(TIndexingObjectField(field_id=LIB_SECTION_FIELD,
                                           field_value=hierarchy_codes,
                                           field_file_path=""))

        object2update = TIndexingObjectData(object_id=object_id,
                                            object_fields=fields)
        yield object2update
        processed_counter.add()
     """