예제 #1
0
    def get_doc(self, filename, path, title, url, contents):
        '''
        Build the `Document` that represents one webpage.

        Input: `filename`: filename of the webpage
               `path`: path of the webpage
               `title`: title of the webpage
               `url`: original url of the webpage
               `contents`: contents of the webpage
        Output: `Document` holding the stored fields `name`, `path`, `title`
                and `url`, plus `contents` when it is not empty
        '''
        page_doc = Document()

        # (field class, field name, value) in the order the fields are added.
        field_specs = (
            (StringField, "name", filename),
            (StringField, "path", path),
            (TextField, "title", title),
            (TextField, "url", url),
        )
        for field_cls, field_name, value in field_specs:
            page_doc.add(field_cls(field_name, value, Field.Store.YES))

        # An empty page only gets a warning; the document is still returned
        # without a `contents` field.
        if len(contents) == 0:
            print("Warning: No content in {}".format(filename))
            return page_doc

        page_doc.add(TextField("contents", contents, Field.Store.YES))
        return page_doc
예제 #2
0
파일: IndexFiles.py 프로젝트: elfdown/ee208
    def indexDocs(self, root, writer):
        """Index every file located inside the sub-directories of `root`.

        Each file is read as UTF-8: the first line is taken as the page URL,
        the second line as the title, and the rest as the contents. The
        contents and the URL are segmented with jieba before being indexed.

        Files that sit directly in `root` are not indexed; only files inside
        its sub-directories (at any depth) are. Every file is visited exactly
        once and is opened through the directory it was actually found in.

        Input: `root`: directory whose sub-directories hold the page files
               `writer`: index writer that receives one document per file
        """
        # URL tokens that say nothing about the site itself.
        url_noise = {'.', 'http', 'https', '/', ':', '?', '=',
                     'html', 'shtml', 'www'}

        tree = os.walk(root)
        # The first tuple produced by os.walk describes `root` itself; its
        # files are deliberately skipped.
        next(tree, None)

        for dirpath, _subdirs, filenames in tree:
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(dirpath, filename)
                # The context manager closes the file even if reading fails.
                with open(path, encoding='utf8') as page_file:
                    # NOTE(review): readline() keeps the trailing newline, so
                    # the stored url and title end with '\n' -- confirm this
                    # is intended.
                    url = page_file.readline()
                    title = page_file.readline()
                    contents = page_file.read()

                # jieba word segmentation
                contents = ' '.join(jieba.lcut_for_search(contents))
                site_tokens = set(jieba.lcut_for_search(url)) - url_noise
                site = ' '.join(list(site_tokens))

                doc = Document()
                doc.add(StringField("name", filename, Field.Store.YES))
                doc.add(StringField("path", path, Field.Store.YES))
                if len(contents) > 0:
                    doc.add(TextField('title', title, Field.Store.YES))
                    doc.add(TextField('site', site, Field.Store.YES))
                    doc.add(TextField('url', url, Field.Store.YES))
                    doc.add(TextField('contents', contents, Field.Store.YES))
                else:
                    print("warning: no content in %s" % filename)
                # A document is added even for an empty page; it then only
                # carries the name and path fields.
                writer.addDocument(doc)
예제 #3
0
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    """Create one document from the given values and add it through `w`.

    `name` is added as a `TextField`; the two dates and the two notes are
    added as `StringField`s. Every field is stored.
    """
    record = Document()
    record.add(TextField("name", name, Field.Store.YES))

    string_fields = (
        ("birth_date", birth_date),
        ("death_date", death_date),
        ("birth_note", birth_note),
        ("death_note", death_note),
    )
    for field_name, value in string_fields:
        record.add(StringField(field_name, value, Field.Store.YES))

    w.addDocument(record)
예제 #4
0
def createDocument(item_id, label, viewSimilar, viewProspective):
    """Return a `Document` for one item.

    `item_id` and `label` become stored `StringField`s named `itemID` and
    `label`. Each element of `viewSimilar` and then each element of
    `viewProspective` is added as its own `StoredField` under the matching
    field name.
    """
    item_doc = Document()
    item_doc.add(StringField('itemID', item_id, Field.Store.YES))
    item_doc.add(StringField('label', label, Field.Store.YES))

    multi_valued = (
        ("viewSimilar", viewSimilar),
        ("viewProspective", viewProspective),
    )
    for field_name, entries in multi_valued:
        for entry in entries:
            item_doc.add(StoredField(field_name, entry))

    return item_doc
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id,
                                         sentence):
    """Return a `Document` describing one sentence of a source document.

    Stored fields, in the order they are added: `org_title` (StringField),
    `preprocessed_title` (TextField), `doc_id` (StringField, converted with
    `str`) and `sentence` (TextField).
    """
    sentence_doc = Document()
    keep = Field.Store.YES

    title_field = StringField("org_title", org_title, keep)
    sentence_doc.add(title_field)

    clean_title_field = TextField("preprocessed_title", preprocessed_title,
                                  keep)
    sentence_doc.add(clean_title_field)

    id_field = StringField("doc_id", str(doc_id), keep)
    sentence_doc.add(id_field)

    sentence_field = TextField("sentence", sentence, keep)
    sentence_doc.add(sentence_field)

    return sentence_doc
예제 #6
0
def create_doc(item_id, label, viewSimilar, viewProspective, model="default"):
    """Build the `Document` for one item and return it with its id.

    The id is the hexadecimal MD5 digest of ``"<label>_<item_id>"``. It is
    added as the non-stored field `id`. The current Unix time in whole
    seconds is added, as a string, in the non-stored field `ttl`.

    Returns a tuple ``(id, document)``.
    """
    item_doc = Document()
    created_at = int(time.time())
    key_bytes = f"{label}_{item_id}".encode('utf-8')
    _id = hashlib.md5(key_bytes).hexdigest()

    fields = (
        StringField("id", _id, Field.Store.NO),
        StringField("itemID", item_id, Field.Store.YES),
        StringField("label", label, Field.Store.YES),
        StoredField("viewSimilar", viewSimilar),
        StoredField("viewProspective", viewProspective),
        StringField("model", model, Field.Store.YES),
        StringField("ttl", str(created_at), Field.Store.NO),
    )
    for field in fields:
        item_doc.add(field)

    return _id, item_doc
def obj_to_document(obj):
    """Convert `obj` into a `Document`.

    The document gets an `index` field (from `obj.index`) and a `type` field
    (the class name of `obj`). Every attribute of `obj.data` is then added
    according to the type of its value, together with a companion field
    named ``<attribute> + LTPF_TYPE`` that records which `LT_*` kind was used.
    String and `hyper_text` values additionally get a non-stored, jieba
    segmented field named ``<attribute> + LTPF_FOR_QUERY``.

    Raises `Exception` when a value has none of the handled types.
    """
    def to_text(x):
        return x.encode('utf8') if isinstance(x, unicode) else str(x)

    def stored_only(name, text):
        # Stored but not indexed.
        return Field(name, text, Field.Store.YES, Field.Index.NO)

    def query_field(name, text):
        # Segmented copy of the text; indexed but not stored.
        return TextField(name + LTPF_FOR_QUERY,
                         ' '.join(jieba.lcut_for_search(text)),
                         Field.Store.NO)

    document = Document()
    document.add(StringField('index', to_text(obj.index), Field.Store.YES))
    document.add(StringField('type', obj.__class__.__name__, Field.Store.YES))

    for key, value in vars(obj.data).items():
        if value is None:
            document.add(stored_only(key, ''))
            type_tag = LT_NONE
        elif isinstance(value, list):
            # Duplicates are dropped by going through a set; a list counts as
            # an int list when its first element is an int.
            if len(value) > 0 and isinstance(value[0], int):
                joined = ' '.join(map(str, set(value)))
                document.add(TextField(key, joined, Field.Store.YES))
                type_tag = LT_INTLIST
            else:
                joined = ' '.join(set(value))
                document.add(TextField(key, joined, Field.Store.YES))
                type_tag = LT_LIST
        elif isinstance(value, str) or isinstance(value, unicode):
            document.add(stored_only(key, value))
            document.add(query_field(key, value))
            type_tag = LT_STRING
        elif isinstance(value, hyper_text):
            document.add(stored_only(key, value.raw))
            document.add(query_field(key, value.text))
            type_tag = LT_HYPERTEXT
        elif isinstance(value, bool):
            # bool is tested before int because bool is a subclass of int.
            flag_text = '1' if value else '0'
            document.add(StringField(key, flag_text, Field.Store.YES))
            type_tag = LT_BOOL
        elif isinstance(value, int) or isinstance(value, long):
            document.add(StringField(key, str(value), Field.Store.YES))
            type_tag = LT_INT
        else:
            raise Exception('unrecognized data type')

        document.add(stored_only(key + LTPF_TYPE, type_tag))

    return document
예제 #8
0
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        Each entry of `documents` is read through the keys 'name', 'content',
        'date', 'url' and 'tags'. The index writer is always closed, even if
        indexing one of the documents raises.

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        try:
            for document in documents:
                # Create a document that will be added to the index
                doc = Document()
                # Add the fields to this document
                doc.add(
                    TextField(Indexer.NAME, document['name'],
                              Field.Store.YES))
                doc.add(
                    Field(Indexer.CONTENT, document['content'],
                          self.__contentType))
                doc.add(
                    StringField(Indexer.DATE, document['date'],
                                Field.Store.YES))
                doc.add(
                    StringField(Indexer.URL, document['url'],
                                Field.Store.YES))
                doc.add(
                    TextField(Indexer.TAGS,
                              self.__qualifyTags(document['tags']),
                              Field.Store.YES))
                doc.add(
                    LongPoint(Indexer.TIMESTAMP,
                              self.__getTimestamp(document['date'])))
                # Add or update the document to the index
                if not self.__boAppend:
                    # New index, so we just add the document (no old document
                    # can be there):
                    if self.__verbose:
                        print("Adding " + document['name'])
                    writer.addDocument(doc)
                else:
                    # Existing index (an old copy of this document may have
                    # been indexed) so we use updateDocument instead to
                    # replace the old one matching the exact name, if present.
                    # NOTE(review): NAME is added as a TextField above, so an
                    # exact-match Term on the full name may not find the old
                    # copy if the analyzer splits the name -- confirm.
                    if self.__verbose:
                        print("Updating " + document['name'])
                    writer.updateDocument(
                        Term(Indexer.NAME, document['name']), doc)

            # Print index information
            print("Indexed %d documents (%d docs in index)" %
                  (len(documents), writer.numDocs()))
        finally:
            # Close the writer on every path so a failure does not leave it
            # open.
            writer.close()
def addDoc(w, data):
    """Build a `Document` from `data` and add it through the writer `w`.

    `data` maps each field name to a sequence whose first element is the
    field value and whose second element is a tag selecting the field type:
    'StringField', 'TextField', 'CUSTOM_FIELD_TEXT',
    'CUSTOM_FIELD_TEXT_NOT_STORED' or 'INTEGER_STORED'. A field with any
    other tag is skipped after printing 'UNKNOWN FIELD'.

    Adding is best effort: if `w.addDocument` fails, every field and value is
    printed and the error is not re-raised.
    """
    doc = Document()
    for field in data:
        entry = data[field]
        value, kind = entry[0], entry[1]

        if kind == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif kind == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif kind == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif kind == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif kind == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except Exception:
        # Deliberately not re-raised: dump the offending record and carry on.
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        print('-----------------------------------')
        for field in data:
            print('field=%s\nvalue=%s' % (field, str(data[field][0])))
예제 #10
0
    def createDoc(self, url, html, duplicate):
        """Return a `Document` for the page at `url`.

        The title and contents come from `self.parseHtml(url, html)`.
        `duplicate` is stored as the lower-cased result of `str(duplicate)`.
        The `contents` field is only added when the parsed contents are not
        empty; otherwise a warning is printed.
        """
        title, contents = self.parseHtml(url, html)

        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(
            StringField("duplicate",
                        str(duplicate).lower(), Field.Store.YES))

        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            # Function-call form: valid on Python 3 and prints the same text
            # on Python 2, unlike the print statement.
            print("Warning: No content in %s" % url)

        return doc
    def add_doc(self, item_data):
        """Insert or replace the document for one item in `self.writer`.

        Reads `item_id` and `ttl` (both required), `version` (defaults to
        'default') and `view_similar` / `view_prospective` (default to an
        empty dict, serialized with `json.dumps`) from `item_data`.

        The document id is the hexadecimal MD5 digest of
        ``"<item_id>_<version>"``; `updateDocument` is called with a `Term` on
        that id.
        """
        item_id, ttl = item_data['item_id'], item_data['ttl']
        version = item_data.get('version', 'default')
        similar_json = json.dumps(item_data.get('view_similar', {}))
        prospective_json = json.dumps(item_data.get('view_prospective', {}))

        item_doc = Document()
        key_bytes = f"{item_id}_{version}".encode('utf-8')
        _id = hashlib.md5(key_bytes).hexdigest()

        fields = (
            StringField("id", _id, Field.Store.NO),
            LongPoint("ttl", ttl),
            StringField("version", version, Field.Store.YES),
            StringField("item_id", item_id, Field.Store.YES),
            StoredField("view_similar", similar_json),
            StoredField("view_prospective", prospective_json),
        )
        for field in fields:
            item_doc.add(field)

        self.writer.updateDocument(Term("id", _id), item_doc)
def add_code_keyword_into_document(document, file_content, node, counter):
    """Add the code keywords listed in `node` to `document` as "word" fields.

    For every non-empty entry of each category in `node`, one field named
    "word" is added to `document` and the matching `*_count` attribute of
    `counter` is incremented by one. `file_content` is not used.

    Returns True when at least one entry was stored from the categories
    `typed_method_call`, `class_instance_creation`, `methods_called` or
    `literals`; otherwise False.
    """
    def stored_analyzed(text):
        return Field("word", text, Field.Store.YES, Field.Index.ANALYZED)

    def unstored_analyzed(text):
        return Field("word", text, Field.Store.NO, Field.Index.ANALYZED)

    def stored_analyzed_str(text):
        return stored_analyzed(str(text))

    def stored_string(text):
        return StringField("word", text, Field.Store.YES)

    # (node key, counter attribute, field factory, sets the return flag)
    categories = (
        ("typed_method_call", "typed_method_call_count",
         stored_analyzed, True),
        ("extends", "extends_count",
         unstored_analyzed, False),
        ("used_classes", "used_classes_count",
         stored_analyzed_str, False),
        ("class_instance_creation", "class_instance_creation_count",
         stored_analyzed, True),
        ("methods", "methods_count",
         stored_analyzed, False),
        ("methods_called", "methods_called_count",
         stored_analyzed, True),
        ("unresolved_method_calls", "unresolved_method_calls_count",
         stored_analyzed, False),
        ("literals", "literals_count",
         stored_string, True),
    )

    # Set when at least one code characteristic has been stored.
    has_characteristic = False
    for node_key, counter_attr, make_field, sets_flag in categories:
        for keyword in node[node_key]:
            if not keyword:
                continue
            document.add(make_field(keyword))
            setattr(counter, counter_attr, getattr(counter, counter_attr) + 1)
            if sets_flag:
                has_characteristic = True

    return has_characteristic
 def addDocument(self, id):
     """Index the entry of the module-level `answers_train` stored under `id`.

     The entry is added as the stored `TextField` "pa" and `str(id)` as the
     stored `StringField` "id". The writer `self.w` is committed right after
     the document is added.
     """
     answer_text = answers_train[id]

     answer_doc = Document()
     answer_doc.add(TextField("pa", answer_text, Field.Store.YES))
     answer_doc.add(StringField("id", str(id), Field.Store.YES))

     writer = self.w
     writer.addDocument(answer_doc)
     writer.commit()
예제 #14
0
def create_document(file_name):
    """Return a `Document` for the file `file_name` inside ``./alldocs/``.

    The document has a stored `title` field holding `file_name` and a stored
    `text` field holding the whole content of the file.
    """
    path = './alldocs/' + file_name
    # The context manager closes the file even if reading fails.
    with open(path) as source:
        text = source.read()

    doc = Document()
    # Use the parameter for the title; the previous code referenced
    # `input_file`, a name this function never defines.
    doc.add(StringField("title", file_name, Field.Store.YES))
    doc.add(TextField("text", text, Field.Store.YES))
    return doc
예제 #15
0
    def get_doc(self, doc_info, contents):
        '''
        Build the `Document` for one webpage from its info mapping.

        Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
               `contents`: contents of the webpage
        Output: `Document` holding the stored fields above, plus `contents`
                when it is not empty
        '''
        page_doc = Document()

        # These four keys are added as StringFields, in this order.
        for key in ("name", "path", "title", "url"):
            page_doc.add(StringField(key, doc_info[key], Field.Store.YES))
        page_doc.add(TextField("site", doc_info['site'], Field.Store.YES))

        # An empty page only gets a warning; the document is still returned
        # without a `contents` field.
        if len(contents) == 0:
            print("Warning: No content in {}".format(doc_info['name']))
            return page_doc

        page_doc.add(TextField("contents", contents, Field.Store.YES))
        return page_doc